
[v4,3/3] nvme/ioctl: move fixed buffer lookup to nvme_uring_cmd_io()

Message ID 20250328154647.2590171-4-csander@purestorage.com (mailing list archive)
State New
Series nvme_map_user_request() cleanup

Commit Message

Caleb Sander Mateos March 28, 2025, 3:46 p.m. UTC
nvme_map_user_request() is called from both nvme_submit_user_cmd() and
nvme_uring_cmd_io(). But the ioucmd branch is only applicable to
nvme_uring_cmd_io(). Move it to nvme_uring_cmd_io() and just pass the
resulting iov_iter to nvme_map_user_request().
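
Concretely, the helper's signature changes from taking the io_uring
command and issue flags to taking an optional, already-imported
iov_iter (as in the diff below):

static int nvme_map_user_request(struct request *req, u64 ubuffer,
		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
		struct iov_iter *iter, unsigned int flags);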

For NVMe passthru operations with fixed buffers, the fixed buffer lookup
happens in io_uring_cmd_import_fixed(). But nvme_uring_cmd_io() can
return -EAGAIN first from nvme_alloc_user_request() if all tags in the
tag set are in use. This ordering difference is observable when using
UBLK_U_IO_{,UN}REGISTER_IO_BUF SQEs to modify the fixed buffer table. If
the NVMe passthru operation is followed by UBLK_U_IO_UNREGISTER_IO_BUF
to unregister the fixed buffer and the NVMe passthru goes async, the
fixed buffer lookup will fail because it happens after the unregister.

Userspace should not depend on the order in which io_uring issues SQEs
submitted in parallel, but it may try submitting the SQEs together and
fall back on a slow path if the fixed buffer lookup fails. To make the
fast path more likely, do the import before nvme_alloc_user_request().
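
For illustration only, a minimal userspace sketch of the "fast path
plus fallback" pattern described above might look as follows. It
assumes liburing and a ring created with IORING_SETUP_SQE128 |
IORING_SETUP_CQE32 (required for NVMe passthru); the namespace ID,
NVMe opcode constant, and the omitted cdw10-cdw12 (SLBA/NLB) fields
are placeholders, not part of this patch:

#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <liburing.h>
#include <linux/nvme_ioctl.h>

/*
 * Sketch: queue an NVMe passthru read on the char-device fd.  If
 * fixed_buf_index >= 0, reference a registered (fixed) buffer via
 * IORING_URING_CMD_FIXED; otherwise addr is treated as a plain user
 * pointer.  LBA fields (cdw10-cdw12) are omitted for brevity.
 */
static int queue_passthru_read(struct io_uring *ring, int nvme_char_fd,
			       void *buf, __u32 len, int fixed_buf_index)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct nvme_uring_cmd *cmd;

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, 2 * sizeof(*sqe));	/* 128-byte SQE */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = nvme_char_fd;
	sqe->cmd_op = NVME_URING_CMD_IO;
	if (fixed_buf_index >= 0) {
		sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
		sqe->buf_index = fixed_buf_index;
	}

	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	cmd->opcode = 0x02;			/* nvme_cmd_read (placeholder) */
	cmd->nsid = 1;				/* placeholder namespace */
	cmd->addr = (__u64)(uintptr_t)buf;
	cmd->data_len = len;

	return io_uring_submit(ring);
}

If the completion of the fixed-buffer submission fails because the
registered buffer was unregistered in the meantime, userspace can
retry the same command with fixed_buf_index < 0, i.e. take the slow
path with a regular user pointer.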

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
---
 drivers/nvme/host/ioctl.c | 45 +++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

Comments

Kanchan Joshi March 31, 2025, 6:46 a.m. UTC | #1
On 3/28/2025 9:16 PM, Caleb Sander Mateos wrote:
> For NVMe passthru operations with fixed buffers, the fixed buffer lookup
> happens in io_uring_cmd_import_fixed(). But nvme_uring_cmd_io() can
> return -EAGAIN first from nvme_alloc_user_request() if all tags in the
> tag set are in use. This ordering difference is observable when using
> UBLK_U_IO_{,UN}REGISTER_IO_BUF SQEs to modify the fixed buffer table. If
> the NVMe passthru operation is followed by UBLK_U_IO_UNREGISTER_IO_BUF
> to unregister the fixed buffer and the NVMe passthru goes async, the
> fixed buffer lookup will fail because it happens after the unregister.

While the patch looks fine, I wonder what setup is required to
trigger/test this, given that io_uring NVMe passthru is on the char
device node and ublk does not take a char device as the backing file.
Care to explain?
Keith Busch March 31, 2025, 2:36 p.m. UTC | #2
On Mon, Mar 31, 2025 at 12:16:58PM +0530, Kanchan Joshi wrote:
> On 3/28/2025 9:16 PM, Caleb Sander Mateos wrote:
> > For NVMe passthru operations with fixed buffers, the fixed buffer lookup
> > happens in io_uring_cmd_import_fixed(). But nvme_uring_cmd_io() can
> > return -EAGAIN first from nvme_alloc_user_request() if all tags in the
> > tag set are in use. This ordering difference is observable when using
> > UBLK_U_IO_{,UN}REGISTER_IO_BUF SQEs to modify the fixed buffer table. If
> > the NVMe passthru operation is followed by UBLK_U_IO_UNREGISTER_IO_BUF
> > to unregister the fixed buffer and the NVMe passthru goes async, the
> > fixed buffer lookup will fail because it happens after the unregister.
> 
> While the patch looks fine, I wonder what setup is required to
> trigger/test this, given that io_uring NVMe passthru is on the char
> device node and ublk does not take a char device as the backing file.
> Care to explain?

Not sure I understand the question. A ublk daemon can use anything it
wants on the backend. Are you just referring to the public ublksrv
implementation? That's not used here, if that's what you mean.
Kanchan Joshi April 2, 2025, 1:21 p.m. UTC | #3
On 3/31/2025 8:06 PM, Keith Busch wrote:
> On Mon, Mar 31, 2025 at 12:16:58PM +0530, Kanchan Joshi wrote:
>> On 3/28/2025 9:16 PM, Caleb Sander Mateos wrote:
>>> For NVMe passthru operations with fixed buffers, the fixed buffer lookup
>>> happens in io_uring_cmd_import_fixed(). But nvme_uring_cmd_io() can
>>> return -EAGAIN first from nvme_alloc_user_request() if all tags in the
>>> tag set are in use. This ordering difference is observable when using
>>> UBLK_U_IO_{,UN}REGISTER_IO_BUF SQEs to modify the fixed buffer table. If
>>> the NVMe passthru operation is followed by UBLK_U_IO_UNREGISTER_IO_BUF
>>> to unregister the fixed buffer and the NVMe passthru goes async, the
>>> fixed buffer lookup will fail because it happens after the unregister.
>> While the patch looks fine, I wonder what setup is required to
>> trigger/test this, given that io_uring NVMe passthru is on the char
>> device node and ublk does not take a char device as the backing file.
>> Care to explain?
> Not sure I understand the question. A ublk daemon can use anything it
> wants on the backend. Are you just referring to the public ublksrv
> implementation? That's not used here, if that's what you mean.

Got it, I did not think beyond the public ublksrv implementation.
A userspace block device backed by the NVMe char device sounds interesting.

Patch

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 42dfd29ed39e..400c3df0e58f 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -112,12 +112,11 @@  static struct request *nvme_alloc_user_request(struct request_queue *q,
 	return req;
 }
 
 static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		struct io_uring_cmd *ioucmd, unsigned int flags,
-		unsigned int iou_issue_flags)
+		struct iov_iter *iter, unsigned int flags)
 {
 	struct request_queue *q = req->q;
 	struct nvme_ns *ns = q->queuedata;
 	struct block_device *bdev = ns ? ns->disk->part0 : NULL;
 	bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
@@ -135,28 +134,16 @@  static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		if (!nvme_ctrl_meta_sgl_supported(ctrl))
 			dev_warn_once(ctrl->device,
 				      "using unchecked metadata buffer\n");
 	}
 
-	if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
-		struct iov_iter iter;
-
-		/* fixedbufs is only for non-vectored io */
-		if (flags & NVME_IOCTL_VEC)
-			return -EINVAL;
-
-		ret = io_uring_cmd_import_fixed(ubuffer, bufflen,
-				rq_data_dir(req), &iter, ioucmd,
-				iou_issue_flags);
-		if (ret < 0)
-			return ret;
-		ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL);
-	} else {
+	if (iter)
+		ret = blk_rq_map_user_iov(q, req, NULL, iter, GFP_KERNEL);
+	else
 		ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer),
 				bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0,
 				0, rq_data_dir(req));
-	}
 
 	if (ret)
 		return ret;
 
 	bio = req->bio;
@@ -194,11 +181,11 @@  static int nvme_submit_user_cmd(struct request_queue *q,
 		return PTR_ERR(req);
 
 	req->timeout = timeout;
 	if (ubuffer && bufflen) {
 		ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
-				meta_len, NULL, flags, 0);
+				meta_len, NULL, flags);
 		if (ret)
 			goto out_free_req;
 	}
 
 	bio = req->bio;
@@ -467,10 +454,12 @@  static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
 	const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
 	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
 	struct nvme_uring_data d;
 	struct nvme_command c;
+	struct iov_iter iter;
+	struct iov_iter *map_iter = NULL;
 	struct request *req;
 	blk_opf_t rq_flags = REQ_ALLOC_CACHE;
 	blk_mq_req_flags_t blk_flags = 0;
 	int ret;
 
@@ -502,10 +491,24 @@  static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	d.addr = READ_ONCE(cmd->addr);
 	d.data_len = READ_ONCE(cmd->data_len);
 	d.metadata_len = READ_ONCE(cmd->metadata_len);
 	d.timeout_ms = READ_ONCE(cmd->timeout_ms);
 
+	if (d.data_len && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
+		/* fixedbufs is only for non-vectored io */
+		if (vec)
+			return -EINVAL;
+
+		ret = io_uring_cmd_import_fixed(d.addr, d.data_len,
+			nvme_is_write(&c) ? WRITE : READ, &iter, ioucmd,
+			issue_flags);
+		if (ret < 0)
+			return ret;
+
+		map_iter = &iter;
+	}
+
 	if (issue_flags & IO_URING_F_NONBLOCK) {
 		rq_flags |= REQ_NOWAIT;
 		blk_flags = BLK_MQ_REQ_NOWAIT;
 	}
 	if (issue_flags & IO_URING_F_IOPOLL)
@@ -515,13 +518,13 @@  static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0;
 
 	if (d.data_len) {
-		ret = nvme_map_user_request(req, d.addr,
-			d.data_len, nvme_to_user_ptr(d.metadata),
-			d.metadata_len, ioucmd, vec, issue_flags);
+		ret = nvme_map_user_request(req, d.addr, d.data_len,
+			nvme_to_user_ptr(d.metadata), d.metadata_len,
+			map_iter, vec);
 		if (ret)
 			goto out_free_req;
 	}
 
 	/* to free bio on completion, as req->bio will be null at that time */