diff mbox

[0/6] ublk zero-copy support

Message ID 20250203154517.937623-1-kbusch@meta.com (mailing list archive)
State New
Headers show

Commit Message

Keith Busch Feb. 3, 2025, 3:45 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

This is a new look at supporting zero copy with ublk.

The previous version from Ming can be viewed here:

  https://lore.kernel.org/linux-block/20241107110149.890530-1-ming.lei@redhat.com/

Based on the feedback from that thread, the desired io_uring interfaces
needed to be simpler, and the kernel registered resources need to behave
more similiar to user registered buffers.

This series introduces a new resource node type, KBUF, which, like the
BUFFER resource, needs to be installed into an io_uring buf_node table
in order for the user to access it in a fixed buffer command. The
new io_uring kernel API provides a way for a user to register a struct
request's bvec to a specific index, and a way to unregister it.

When the ublk server receives notification of a new command, it must
first select an index and register the zero copy buffer. It may use that
index for any number of fixed buffer commands, then it must unregister
the index when it's done. This can all be done in a single io_uring_enter
if desired, or it can be split into multiple enters if needed.

The io_uring instance that gets the zero copy registration doesn't
necessarily need to be the same ring that is receiving notifcations from
the ublk_drv module. This allows you to split frontend and backend rings
if desired.

At the end of this cover letter, I've provided a patch to the ublksrv to
demonstrate how to use this.

Jens Axboe (1):
  io_uring: use node for import

Keith Busch (5):
  block: const blk_rq_nr_phys_segments request
  io_uring: add support for kernel registered bvecs
  ublk: zc register/unregister bvec
  io_uring: add abstraction for buf_table rsrc data
  io_uring: cache nodes and mapped buffers

 drivers/block/ublk_drv.c       | 139 +++++++++++++-----
 include/linux/blk-mq.h         |   2 +-
 include/linux/io_uring.h       |   1 +
 include/linux/io_uring_types.h |  25 +++-
 include/uapi/linux/ublk_cmd.h  |   4 +
 io_uring/fdinfo.c              |   8 +-
 io_uring/filetable.c           |   2 +-
 io_uring/net.c                 |   5 +-
 io_uring/nop.c                 |   2 +-
 io_uring/register.c            |   2 +-
 io_uring/rsrc.c                | 259 ++++++++++++++++++++++++++-------
 io_uring/rsrc.h                |   8 +-
 io_uring/rw.c                  |   4 +-
 io_uring/uring_cmd.c           |   4 +-
 14 files changed, 351 insertions(+), 114 deletions(-)
diff mbox

Patch

diff --git a/include/ublk_cmd.h b/include/ublk_cmd.h
index 0150003..07439be 100644
--- a/include/ublk_cmd.h
+++ b/include/ublk_cmd.h
@@ -94,6 +94,10 @@ 
 	_IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd)
 #define	UBLK_U_IO_NEED_GET_DATA		\
 	_IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd)
+#define UBLK_U_IO_REGISTER_IO_BUF	\
+	_IOWR('u', 0x23, struct ublksrv_io_cmd)
+#define UBLK_U_IO_UNREGISTER_IO_BUF	\
+	_IOWR('u', 0x24, struct ublksrv_io_cmd)
 
 /* only ABORT means that no re-fetch */
 #define UBLK_IO_RES_OK			0
diff --git a/include/ublksrv_tgt.h b/include/ublksrv_tgt.h
index 1deee2b..6291531 100644
--- a/include/ublksrv_tgt.h
+++ b/include/ublksrv_tgt.h
@@ -189,4 +189,17 @@  static inline void ublk_get_sqe_pair(struct io_uring *r,
 		*sqe2 = io_uring_get_sqe(r);
 }
 
+static inline void ublk_get_sqe_three(struct io_uring *r,
+		struct io_uring_sqe **sqe1, struct io_uring_sqe **sqe2,
+		struct io_uring_sqe **sqe3)
+{
+	unsigned left = io_uring_sq_space_left(r);
+
+	if (left < 3)
+		io_uring_submit(r);
+
+	*sqe1 = io_uring_get_sqe(r);
+	*sqe2 = io_uring_get_sqe(r);
+	*sqe3 = io_uring_get_sqe(r);
+}
 #endif
diff --git a/lib/ublksrv.c b/lib/ublksrv.c
index 16a9e13..7205247 100644
--- a/lib/ublksrv.c
+++ b/lib/ublksrv.c
@@ -619,6 +619,15 @@  skip_alloc_buf:
 		goto fail;
 	}
 
+	if (ctrl_dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) {
+		ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth);
+		if (ret) {
+			ublk_err("ublk dev %d queue %d register spare buffers failed %d",
+					q->dev->ctrl_dev->dev_info.dev_id, q->q_id, ret);
+			goto fail;
+		}
+	}
+
 	io_uring_register_ring_fd(&q->ring);
 
 	/*
diff --git a/tgt_loop.cpp b/tgt_loop.cpp
index 0f16676..ce44c7d 100644
--- a/tgt_loop.cpp
+++ b/tgt_loop.cpp
@@ -246,12 +246,62 @@  static inline int loop_fallocate_mode(const struct ublksrv_io_desc *iod)
        return mode;
 }
 
+static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe,
+		int dev_fd, int tag, int q_id, __u64 index)
+{
+	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+
+	io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
+	sqe->opcode		= IORING_OP_URING_CMD;
+	sqe->flags		|= IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE;
+	sqe->cmd_op		= UBLK_U_IO_REGISTER_IO_BUF;
+
+	cmd->tag		= tag;
+	cmd->addr		= index;
+	cmd->q_id		= q_id;
+}
+
+static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe,
+		int dev_fd, int tag, int q_id, __u64 index)
+{
+	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;
+
+	io_uring_prep_read(sqe, dev_fd, 0, 0, 0);
+	sqe->opcode		= IORING_OP_URING_CMD;
+	sqe->flags		|= IOSQE_CQE_SKIP_SUCCESS | IOSQE_FIXED_FILE;
+	sqe->cmd_op		= UBLK_U_IO_UNREGISTER_IO_BUF;
+
+	cmd->tag		= tag;
+	cmd->addr		= index;
+	cmd->q_id		= q_id;
+}
+
 static void loop_queue_tgt_read(const struct ublksrv_queue *q,
 		const struct ublksrv_io_desc *iod, int tag)
 {
+	const struct ublksrv_ctrl_dev_info *info =
+		ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(q->dev));
 	unsigned ublk_op = ublksrv_get_op(iod);
 
-	if (user_copy) {
+	if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+		struct io_uring_sqe *reg;
+		struct io_uring_sqe *read;
+		struct io_uring_sqe *ureg;
+
+		ublk_get_sqe_three(q->ring_ptr, &reg, &read, &ureg);
+
+		io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag);
+
+		io_uring_prep_read_fixed(read, 1 /*fds[1]*/,
+			0,
+			iod->nr_sectors << 9,
+			iod->start_sector << 9,
+			tag);
+		io_uring_sqe_set_flags(read, IOSQE_FIXED_FILE);
+		read->user_data = build_user_data(tag, ublk_op, 0, 1);
+
+		io_uring_prep_buf_unregister(ureg, 0, tag, q->q_id, tag);
+	} else if (user_copy) {
 		struct io_uring_sqe *sqe, *sqe2;
 		__u64 pos = ublk_pos(q->q_id, tag, 0);
 		void *buf = ublksrv_queue_get_io_buf(q, tag);
@@ -286,9 +336,29 @@  static void loop_queue_tgt_read(const struct ublksrv_queue *q,
 static void loop_queue_tgt_write(const struct ublksrv_queue *q,
 		const struct ublksrv_io_desc *iod, int tag)
 {
+	const struct ublksrv_ctrl_dev_info *info =
+		ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(q->dev));
 	unsigned ublk_op = ublksrv_get_op(iod);
 
-	if (user_copy) {
+	if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+		struct io_uring_sqe *reg;
+		struct io_uring_sqe *write;
+		struct io_uring_sqe *ureg;
+
+		ublk_get_sqe_three(q->ring_ptr, &reg, &write, &ureg);
+
+		io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag);
+
+		io_uring_prep_write_fixed(write, 1 /*fds[1]*/,
+			0,
+			iod->nr_sectors << 9,
+			iod->start_sector << 9,
+			tag);
+		io_uring_sqe_set_flags(write, IOSQE_FIXED_FILE);
+		write->user_data = build_user_data(tag, ublk_op, 0, 1);
+
+		io_uring_prep_buf_unregister(ureg, 0, tag, q->q_id, tag);
+	} else if (user_copy) {
 		struct io_uring_sqe *sqe, *sqe2;
 		__u64 pos = ublk_pos(q->q_id, tag, 0);
 		void *buf = ublksrv_queue_get_io_buf(q, tag);
diff --git a/ublksrv_tgt.cpp b/ublksrv_tgt.cpp
index 8f9cf28..f3ebe14 100644
--- a/ublksrv_tgt.cpp
+++ b/ublksrv_tgt.cpp
@@ -723,7 +723,7 @@  static int cmd_dev_add(int argc, char *argv[])
 			data.tgt_type = optarg;
 			break;
 		case 'z':
-			data.flags |= UBLK_F_SUPPORT_ZERO_COPY;
+			data.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY;
 			break;
 		case 'q':
 			data.nr_hw_queues = strtol(optarg, NULL, 10);