@@ -62,6 +62,8 @@
struct ublk_rq_data {
struct llist_node node;
struct callback_head work;
+ bool allocated_bvec;
+ struct io_mapped_buf buf[0];
};
struct ublk_uring_cmd_pdu {
@@ -525,10 +527,87 @@ static inline int ublk_copy_user_pages(struct ublk_map_data *data,
return done;
}
-static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
+/*
+ * The built command buffer is immutable, so it is fine to feed it to
+ * concurrent io_uring fused commands
+ */
+static int ublk_init_zero_copy_buffer(struct request *rq)
+{
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+ struct io_mapped_buf *imu = data->buf;
+ struct req_iterator rq_iter;
+ unsigned int nr_bvecs = 0;
+ struct bio_vec *bvec;
+ unsigned int offset;
+ struct bio_vec bv;
+
+ if (!ublk_rq_has_data(rq))
+ goto exit;
+
+ rq_for_each_bvec(bv, rq, rq_iter)
+ nr_bvecs++;
+
+ if (!nr_bvecs)
+ goto exit;
+
+ if (rq->bio != rq->biotail) {
+ int idx = 0;
+
+ bvec = kvmalloc_array(nr_bvecs, sizeof(struct bio_vec),
+ GFP_NOIO);
+ if (!bvec)
+ return -ENOMEM;
+
+ offset = 0;
+ rq_for_each_bvec(bv, rq, rq_iter)
+ bvec[idx++] = bv;
+ data->allocated_bvec = true;
+ } else {
+ struct bio *bio = rq->bio;
+
+ offset = bio->bi_iter.bi_bvec_done;
+ bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ }
+ imu->bvec = bvec;
+ imu->nr_bvecs = nr_bvecs;
+ imu->offset = offset;
+ imu->buf = 0;
+ imu->buf_end = blk_rq_bytes(rq);
+
+ return 0;
+exit:
+ imu->bvec = NULL;
+ return 0;
+}
+
+static void ublk_deinit_zero_copy_buffer(struct request *rq)
+{
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+ struct io_mapped_buf *imu = data->buf;
+
+ if (data->allocated_bvec) {
+ kvfree(imu->bvec);
+ data->allocated_bvec = false;
+ }
+}
+
+static int ublk_map_io(const struct ublk_queue *ubq, struct request *req,
struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+
+ if (ubq->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+ int ret = ublk_init_zero_copy_buffer(req);
+
+ /*
+ * The only failure is -ENOMEM from allocating the fused cmd
+ * buffer; return zero so that this req can be requeued.
+ */
+ if (unlikely(ret))
+ return 0;
+ return rq_bytes;
+ }
+
/*
* no zero copy, we delay copy WRITE request data into ublksrv
* context and the big benefit is that pinning pages in current
@@ -553,11 +632,17 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
}
static int ublk_unmap_io(const struct ublk_queue *ubq,
- const struct request *req,
+ struct request *req,
struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+ if (ubq->flags & UBLK_F_SUPPORT_ZERO_COPY) {
+ ublk_deinit_zero_copy_buffer(req);
+
+ return rq_bytes;
+ }
+
if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
struct ublk_map_data data = {
.ubq = ubq,
@@ -693,6 +778,7 @@ static void ublk_complete_rq(struct request *req)
return;
exit:
+ ublk_deinit_zero_copy_buffer(req);
blk_mq_end_request(req, res);
}
@@ -1259,6 +1345,66 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
ublk_queue_cmd(ubq, req);
}
+static inline bool ublk_check_fused_buf_dir(const struct request *req,
+ unsigned int flags)
+{
+ flags &= IO_URING_F_FUSED;
+
+ if (req_op(req) == REQ_OP_READ && flags == IO_URING_F_FUSED_WRITE)
+ return true;
+
+ if (req_op(req) == REQ_OP_WRITE && flags == IO_URING_F_FUSED_READ)
+ return true;
+
+ return false;
+}
+
+static void ublk_fused_cmd_done_cb(struct io_uring_cmd *cmd)
+{
+ io_uring_cmd_done(cmd, cmd->fused.data.slave_res, 0);
+}
+
+static int ublk_handle_fused_cmd(struct io_uring_cmd *cmd,
+ struct ublk_queue *ubq, int tag, unsigned int issue_flags)
+{
+ struct ublk_device *ub = cmd->file->private_data;
+ struct ublk_rq_data *data;
+ struct request *req;
+
+ if (!ub)
+ return -EPERM;
+
+ if (!(issue_flags & IO_URING_F_FUSED))
+ goto exit;
+
+ if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+ goto exit;
+
+ if (!(ubq->flags & UBLK_F_SUPPORT_ZERO_COPY))
+ goto exit;
+
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ if (!req || !blk_mq_request_started(req))
+ goto exit;
+
+ pr_devel("%s: qid %d tag %u request bytes %u, issue flags %x\n",
+ __func__, ubq->q_id, tag, blk_rq_bytes(req),
+ issue_flags);
+
+ if (!ublk_check_fused_buf_dir(req, issue_flags))
+ goto exit;
+
+ if (!ublk_rq_has_data(req))
+ goto exit;
+
+ data = blk_mq_rq_to_pdu(req);
+ io_fused_cmd_provide_kbuf(cmd, !(issue_flags & IO_URING_F_UNLOCKED),
+ data->buf, ublk_fused_cmd_done_cb);
+ return -EIOCBQUEUED;
+exit:
+ return -EINVAL;
+}
+
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
@@ -1277,7 +1423,8 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
if (!(issue_flags & IO_URING_F_SQE128))
goto out;
- if (issue_flags & IO_URING_F_FUSED)
+ if ((issue_flags & IO_URING_F_FUSED) &&
+ cmd_op != UBLK_IO_FUSED_SUBMIT_IO)
return -EOPNOTSUPP;
if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
@@ -1287,7 +1434,8 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
if (!ubq || ub_cmd->q_id != ubq->q_id)
goto out;
- if (ubq->ubq_daemon && ubq->ubq_daemon != current)
+ if ((ubq->ubq_daemon && ubq->ubq_daemon != current) &&
+ (cmd_op != UBLK_IO_FUSED_SUBMIT_IO))
goto out;
if (tag >= ubq->q_depth)
@@ -1310,6 +1458,9 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
goto out;
switch (cmd_op) {
+ case UBLK_IO_FUSED_SUBMIT_IO:
+ return ublk_handle_fused_cmd(cmd, ubq, tag, issue_flags);
+
case UBLK_IO_FETCH_REQ:
/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
if (ublk_queue_ready(ubq)) {
@@ -1533,11 +1684,14 @@ static void ublk_align_max_io_size(struct ublk_device *ub)
static int ublk_add_tag_set(struct ublk_device *ub)
{
+ int zc = !!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY);
+ struct ublk_rq_data *data;
+
ub->tag_set.ops = &ublk_mq_ops;
ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
ub->tag_set.numa_node = NUMA_NO_NODE;
- ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
+ ub->tag_set.cmd_size = struct_size(data, buf, zc);
ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ub->tag_set.driver_data = ub;
return blk_mq_alloc_tag_set(&ub->tag_set);
@@ -1756,9 +1910,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
- /* We are not ready to support zero copy */
- ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
-
ub->dev_info.nr_hw_queues = min_t(unsigned int,
ub->dev_info.nr_hw_queues, nr_cpu_ids);
ublk_align_max_io_size(ub);
@@ -44,6 +44,7 @@
#define UBLK_IO_FETCH_REQ 0x20
#define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21
#define UBLK_IO_NEED_GET_DATA 0x22
+#define UBLK_IO_FUSED_SUBMIT_IO 0x23
/* only ABORT means that no re-fetch */
#define UBLK_IO_RES_OK 0
Apply io_uring fused commands for supporting zero copy:

1) init the fused cmd buffer (io_mapped_buf) in ublk_map_io() and deinit it
   in ublk_unmap_io(); this buffer is immutable, so it is just fine to
   retrieve it from concurrent fused commands

2) add the sub-command opcode UBLK_IO_FUSED_SUBMIT_IO for retrieving this
   fused cmd (zero copy) buffer

3) call io_fused_cmd_provide_kbuf() to provide the buffer to the slave
   request; meanwhile set up the complete callback via this API, so that
   once the slave request is completed, the callback is called to free the
   buffer and complete the fused command

Todo: don't complete the ublk block request until all in-flight fused
commands aiming at this request are completed; this change requires cleaning
up the current ublk driver a bit, so delay that work to a future post. It
won't affect reviewing this whole approach.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/ublk_drv.c      | 167 ++++++++++++++++++++++++++++++++--
 include/uapi/linux/ublk_cmd.h |   1 +
 2 files changed, 160 insertions(+), 8 deletions(-)
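
For reviewers, here is a minimal plain-C sketch of how a ublk server might
fill the primary SQE for the new UBLK_IO_FUSED_SUBMIT_IO sub-command. Only
cmd_op, q_id and tag are defined by this patch; the primary/slave SQE
pairing, the opcode of the primary SQE, and how the slave READ/WRITE on the
backing file borrows the kernel buffer are all defined by the io_uring side
of this series, so the fused_opcode parameter and the comments about the
slave SQE below are assumptions for illustration only.

/* Illustrative sketch only: the fused-command opcode and the slave SQE
 * wiring come from the io_uring patches of this series, not from this
 * driver patch, so 'fused_opcode' is passed in by the caller.
 */
#include <string.h>
#include <liburing.h>
#include <linux/ublk_cmd.h>

/* Fill the primary SQE that asks ublk_drv for the request's io_mapped_buf.
 * The ring must be created with IORING_SETUP_SQE128 so that the
 * ublksrv_io_cmd payload fits into sqe->cmd.
 */
static void ublk_prep_fused_submit(struct io_uring_sqe *sqe, __u8 fused_opcode,
				   int ublk_ch_fd, __u16 q_id, __u16 tag)
{
	struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd;

	memset(sqe, 0, 2 * sizeof(*sqe));	/* 128-byte SQE */
	sqe->opcode = fused_opcode;		/* from the io_uring fused-cmd series */
	sqe->fd = ublk_ch_fd;			/* /dev/ublkcN character device */
	sqe->cmd_op = UBLK_IO_FUSED_SUBMIT_IO;
	cmd->q_id = q_id;			/* validated against ubq->q_id in ublk_ch_uring_cmd() */
	cmd->tag = tag;				/* resolved to the request via blk_mq_tag_to_rq() */

	/* The following SQE is expected to be the slave READ/WRITE on the
	 * backing file; it consumes the io_mapped_buf that ublk_map_io()
	 * built and that ublk_handle_fused_cmd() hands over via
	 * io_fused_cmd_provide_kbuf().
	 */
}

Note that, per the relaxed daemon check in ublk_ch_uring_cmd(), this primary
SQE does not have to be submitted from the queue's daemon task, unlike the
FETCH/COMMIT commands.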