Message ID | 20250324134905.766777-7-ming.lei@redhat.com (mailing list archive)
---|---
State | New
Series | ublk: cleanup & improvement & zc follow-up
Context | Check | Description |
---|---|---
shin/vmtest-for-next-PR | success | PR summary |
shin/vmtest-for-next-VM_Test-1 | success | Logs for build-kernel |
shin/vmtest-for-next-VM_Test-0 | success | Logs for build-kernel |
On Mon, Mar 24, 2025 at 09:49:01PM +0800, Ming Lei wrote:
> Implement ->queue_rqs() for improving perf in case of MQ.
>
> In this way, we just need to call io_uring_cmd_complete_in_task() once for
> one batch, then both io_uring and ublk server can get exact batch from
> client side.
>
> Follows IOPS improvement:
>
> - tests
>
>   tools/testing/selftests/ublk/kublk add -t null -q 2 [-z]
>
>   fio/t/io_uring -p0 /dev/ublkb0
>
> - results:
>
>   more than 10% IOPS boost observed

Nice!

> Pass all ublk selftests, especially the io dispatch order test.
>
> Cc: Uday Shankar <ushankar@purestorage.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  drivers/block/ublk_drv.c | 85 ++++++++++++++++++++++++++++++++++++----
>  1 file changed, 77 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 53a463681a41..86621fde7fde 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -83,6 +83,7 @@ struct ublk_rq_data {
>  struct ublk_uring_cmd_pdu {
>  	struct ublk_queue *ubq;
>  	u16 tag;
> +	struct rq_list list;
>  };

I think you'll have a hole after tag here and end up eating the full 32
bytes on x86_64. Maybe put list before tag to leave a bit of space for
some small fields?

Separately I think we should try to consolidate all the per-IO state. It
is currently split across struct io_uring_cmd PDU, struct request and
its PDU, and struct ublk_io. Maybe we can store everything in struct
ublk_io and just put pointers to that where needed.

> @@ -1258,6 +1259,32 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
>  	io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
>  }
>
> +static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
> +		unsigned int issue_flags)
> +{
> +	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> +	struct ublk_queue *ubq = pdu->ubq;
> +	struct request *rq;
> +
> +	while ((rq = rq_list_pop(&pdu->list))) {
> +		struct ublk_io *io = &ubq->ios[rq->tag];
> +
> +		ublk_rq_task_work_cb(io->cmd, issue_flags);

Maybe rename to ublk_dispatch_one_rq or similar?

> +	}
> +}

[...]

> @@ -3147,6 +3213,9 @@ static int __init ublk_init(void)
>  	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
>  			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
>
> +	BUILD_BUG_ON(sizeof(struct ublk_uring_cmd_pdu) >
> +			sizeof_field(struct io_uring_cmd, pdu));
> +

Consider putting this near the struct ublk_uring_cmd_pdu definition or
in ublk_get_uring_cmd_pdu. There are helpers provided by io_uring
(io_uring_cmd_private_sz_check and io_uring_cmd_to_pdu) you can use.

>  	init_waitqueue_head(&ublk_idr_wq);
>
>  	ret = misc_register(&ublk_misc);
> --
> 2.47.0
>
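For illustration, a minimal layout sketch of the reorder Uday suggests,
assuming x86_64 (8-byte pointers), a 32-byte io_uring_cmd pdu area, and
struct rq_list being a head/tail pointer pair; offsets are illustrative,
not the layout any posted version commits to:

/*
 * With list ahead of tag, the 6-byte hole that previously sat between
 * tag and list becomes tail padding, usable for future small fields.
 */
struct ublk_uring_cmd_pdu {
	struct ublk_queue *ubq;	/* offset 0, 8 bytes */
	struct rq_list list;	/* offset 8, 16 bytes (head + tail pointer) */
	u16 tag;		/* offset 24; 6 bytes of tail padding remain */
};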
On Mon, Mar 24, 2025 at 11:07:21AM -0600, Uday Shankar wrote:
> On Mon, Mar 24, 2025 at 09:49:01PM +0800, Ming Lei wrote:
> > Implement ->queue_rqs() for improving perf in case of MQ.

[...]

> > @@ -83,6 +83,7 @@ struct ublk_rq_data {
> >  struct ublk_uring_cmd_pdu {
> >  	struct ublk_queue *ubq;
> >  	u16 tag;
> > +	struct rq_list list;
> >  };
>
> I think you'll have a hole after tag here and end up eating the full 32
> bytes on x86_64. Maybe put list before tag to leave a bit of space for
> some small fields?

Actually here the rq_list member can share space with `tag` or even
`ubq`; I will do that in the next version.

> Separately I think we should try to consolidate all the per-IO state. It
> is currently split across struct io_uring_cmd PDU, struct request and
> its PDU, and struct ublk_io. Maybe we can store everything in struct
> ublk_io and just put pointers to that where needed.

The uring_cmd and the request are two different things:

1) io_uring_cmd & ublk_io have a longer lifetime than the request, since
   the command covers both request delivery and notification; for the
   latter, there isn't a request for this slot yet.

2) in this case, the batch list has to be put into the command payload
   because io_uring knows nothing about requests; we need to retrieve
   the request or ublk_io from the uring_cmd payload.

3) both ublk_io and the request payload are pre-allocated, and
   pre-allocation should be avoided as far as possible, so I think it
   may not be good to put everything into ublk_io.

> > +static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
> > +		unsigned int issue_flags)
> > +{
> > +	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> > +	struct ublk_queue *ubq = pdu->ubq;
> > +	struct request *rq;
> > +
> > +	while ((rq = rq_list_pop(&pdu->list))) {
> > +		struct ublk_io *io = &ubq->ios[rq->tag];
> > +
> > +		ublk_rq_task_work_cb(io->cmd, issue_flags);
>
> Maybe rename to ublk_dispatch_one_rq or similar?

ublk_rq_task_work_cb() is still used as the uring_cmd callback in the
->queue_rq() case...

[...]

> > +	BUILD_BUG_ON(sizeof(struct ublk_uring_cmd_pdu) >
> > +			sizeof_field(struct io_uring_cmd, pdu));
> > +
>
> Consider putting this near the struct ublk_uring_cmd_pdu definition or
> in ublk_get_uring_cmd_pdu. There are helpers provided by io_uring
> (io_uring_cmd_private_sz_check and io_uring_cmd_to_pdu) you can use.

Yeah, we can do that, but it looks like the usual practice is to add
build checks in the init function.

Thanks,
Ming
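For illustration, a sketch of the space sharing Ming describes, assuming
the batch list is only live between ublk_queue_cmd_list() and its
task-work callback; hypothetical, not necessarily the v2 layout:

/*
 * Hypothetical overlap of the batch list with the single-request
 * fields. Note this only works if the batch callback stops reading
 * pdu->ubq/pdu->tag and instead derives them from each popped request
 * (rq->mq_hctx->driver_data and rq->tag), because storing the list
 * clobbers the overlapped fields until the command is resubmitted and
 * they are set up again.
 */
struct ublk_uring_cmd_pdu {
	union {
		struct {
			struct ublk_queue *ubq;	/* ->queue_rq() path */
			u16 tag;
		};
		struct rq_list list;		/* ->queue_rqs() batch */
	};
};

A plain reorder (list first, then ubq and tag) fits the 32-byte pdu area
just as well without the aliasing subtlety, so the non-union variant may
be the safer choice for v2.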
On Mon, Mar 24, 2025 at 6:49 AM Ming Lei <ming.lei@redhat.com> wrote:
>
> Implement ->queue_rqs() for improving perf in case of MQ.

[...]

> +static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
> +		unsigned int issue_flags)
> +{
> +	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> +	struct ublk_queue *ubq = pdu->ubq;
> +	struct request *rq;
> +
> +	while ((rq = rq_list_pop(&pdu->list))) {
> +		struct ublk_io *io = &ubq->ios[rq->tag];
> +
> +		ublk_rq_task_work_cb(io->cmd, issue_flags);

ublk_rq_task_work_cb() is duplicating the lookup of ubq, rq, and io.
Could you factor out a helper that takes those values instead of cmd?

> +	}
> +}
> +
> +static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
> +{
> +	struct request *rq = l->head;
> +	struct ublk_io *io = &ubq->ios[rq->tag];
> +	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd);
> +
> +	pdu->ubq = ubq;

Why does pdu->ubq need to be set here but not in ublk_queue_cmd()? I
would have thought it would already be set to ubq because pdu comes
from a rq belonging to this ubq.

> +	pdu->list = *l;
> +	rq_list_init(l);
> +	io_uring_cmd_complete_in_task(io->cmd, ublk_cmd_list_tw_cb);

Could store io->cmd in a variable to avoid looking it up twice.

> +}

[...]

> -static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> -		const struct blk_mq_queue_data *bd)
> +static blk_status_t ublk_prep_rq_batch(struct request *rq)

naming nit: why "batch"?

[...]

>  	if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
>  		return BLK_STS_IOERR;
>
> +	if (unlikely(ubq->canceling))
> +		return BLK_STS_IOERR;

Why is ubq->canceling treated differently for ->queue_rq() vs. ->queue_rqs()?

> +
> +	blk_mq_start_request(rq);
> +	return BLK_STS_OK;
> +}

[...]

> +static void ublk_queue_rqs(struct rq_list *rqlist)
> +{
> +	struct rq_list requeue_list = { };
> +	struct rq_list submit_list = { };
> +	struct ublk_queue *ubq = NULL;
> +	struct request *req;
> +
> +	while ((req = rq_list_pop(rqlist))) {
> +		struct ublk_queue *this_q = req->mq_hctx->driver_data;
> +
> +		if (ubq && ubq != this_q && !rq_list_empty(&submit_list))
> +			ublk_queue_cmd_list(ubq, &submit_list);
> +		ubq = this_q;

Probably could avoid the extra ->driver_data dereference on every rq
by comparing the mq_hctx pointers instead. The ->driver_data
dereference could be moved to the ublk_queue_cmd_list() calls.

[...]

> +	BUILD_BUG_ON(sizeof(struct ublk_uring_cmd_pdu) >
> +			sizeof_field(struct io_uring_cmd, pdu));

Looks like Uday also suggested this, but if you change
ublk_get_uring_cmd_pdu() to use io_uring_cmd_to_pdu(), you get this
check for free.

Best,
Caleb

> +
>  	init_waitqueue_head(&ublk_idr_wq);
>
>  	ret = misc_register(&ublk_misc);
> --
> 2.47.0
>
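For illustration, here is roughly what the loop looks like with Caleb's
suggestion applied, comparing hctx pointers per request and deferring
the driver_data dereference to the flush points; a sketch against the
helpers in this patch, not the posted v2:

static void ublk_queue_rqs(struct rq_list *rqlist)
{
	struct rq_list requeue_list = { };
	struct rq_list submit_list = { };
	struct blk_mq_hw_ctx *hctx = NULL;
	struct request *req;

	while ((req = rq_list_pop(rqlist))) {
		/* flush the current batch when crossing a hctx boundary */
		if (hctx && req->mq_hctx != hctx &&
		    !rq_list_empty(&submit_list))
			ublk_queue_cmd_list(hctx->driver_data, &submit_list);
		hctx = req->mq_hctx;

		if (ublk_prep_rq_batch(req) == BLK_STS_OK)
			rq_list_add_tail(&submit_list, req);
		else
			rq_list_add_tail(&requeue_list, req);
	}

	if (hctx && !rq_list_empty(&submit_list))
		ublk_queue_cmd_list(hctx->driver_data, &submit_list);
	*rqlist = requeue_list;
}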
On Wed, Mar 26, 2025 at 02:30:56PM -0700, Caleb Sander Mateos wrote:
> On Mon, Mar 24, 2025 at 6:49 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > Implement ->queue_rqs() for improving perf in case of MQ.

[...]

> > +	while ((rq = rq_list_pop(&pdu->list))) {
> > +		struct ublk_io *io = &ubq->ios[rq->tag];
> > +
> > +		ublk_rq_task_work_cb(io->cmd, issue_flags);
>
> ublk_rq_task_work_cb() is duplicating the lookup of ubq, rq, and io.
> Could you factor out a helper that takes those values instead of cmd?
>
> > +	pdu->ubq = ubq;
>
> Why does pdu->ubq need to be set here but not in ublk_queue_cmd()? I
> would have thought it would already be set to ubq because pdu comes
> from a rq belonging to this ubq.
>
> > +	pdu->list = *l;
> > +	rq_list_init(l);
> > +	io_uring_cmd_complete_in_task(io->cmd, ublk_cmd_list_tw_cb);
>
> Could store io->cmd in a variable to avoid looking it up twice.
>
> > +static blk_status_t ublk_prep_rq_batch(struct request *rq)
>
> naming nit: why "batch"?

All of these are handled in my local version.

> >  	if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
> >  		return BLK_STS_IOERR;
> >
> > +	if (unlikely(ubq->canceling))
> > +		return BLK_STS_IOERR;
>
> Why is ubq->canceling treated differently for ->queue_rq() vs. ->queue_rqs()?

It is the same; ublk_queue_rqs() just becomes simpler by letting
->queue_rq() handle ubq->canceling.

The ubq->canceling handling really needs a comment, though: it has to be
done after ->fail_io/->force_abort are dealt with, otherwise an IO hang
is caused when removing the disk. That is one bug introduced in this
patch.

> > +		if (ubq && ubq != this_q && !rq_list_empty(&submit_list))
> > +			ublk_queue_cmd_list(ubq, &submit_list);
> > +		ubq = this_q;
>
> Probably could avoid the extra ->driver_data dereference on every rq
> by comparing the mq_hctx pointers instead. The ->driver_data
> dereference could be moved to the ublk_queue_cmd_list() calls.

Yes.

> > +	BUILD_BUG_ON(sizeof(struct ublk_uring_cmd_pdu) >
> > +			sizeof_field(struct io_uring_cmd, pdu));
>
> Looks like Uday also suggested this, but if you change
> ublk_get_uring_cmd_pdu() to use io_uring_cmd_to_pdu(), you get this
> check for free.

I have followed Uday's suggestion to patch ublk_get_uring_cmd_pdu(),
and will send out v2 soon.

Thanks,
Ming
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 53a463681a41..86621fde7fde 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -83,6 +83,7 @@ struct ublk_rq_data {
 struct ublk_uring_cmd_pdu {
 	struct ublk_queue *ubq;
 	u16 tag;
+	struct rq_list list;
 };
 
 /*
@@ -1258,6 +1259,32 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 	io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
 }
 
+static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+	struct ublk_queue *ubq = pdu->ubq;
+	struct request *rq;
+
+	while ((rq = rq_list_pop(&pdu->list))) {
+		struct ublk_io *io = &ubq->ios[rq->tag];
+
+		ublk_rq_task_work_cb(io->cmd, issue_flags);
+	}
+}
+
+static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
+{
+	struct request *rq = l->head;
+	struct ublk_io *io = &ubq->ios[rq->tag];
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd);
+
+	pdu->ubq = ubq;
+	pdu->list = *l;
+	rq_list_init(l);
+	io_uring_cmd_complete_in_task(io->cmd, ublk_cmd_list_tw_cb);
+}
+
 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
 {
 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
@@ -1296,16 +1323,13 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
 	return BLK_EH_RESET_TIMER;
 }
 
-static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
-		const struct blk_mq_queue_data *bd)
+static blk_status_t ublk_prep_rq_batch(struct request *rq)
 {
-	struct ublk_queue *ubq = hctx->driver_data;
-	struct request *rq = bd->rq;
+	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
 	blk_status_t res;
 
-	if (unlikely(ubq->fail_io)) {
+	if (unlikely(ubq->fail_io))
 		return BLK_STS_TARGET;
-	}
 
 	/* fill iod to slot in io cmd buffer */
 	res = ublk_setup_iod(ubq, rq);
@@ -1324,17 +1348,58 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
 		return BLK_STS_IOERR;
 
+	if (unlikely(ubq->canceling))
+		return BLK_STS_IOERR;
+
+	blk_mq_start_request(rq);
+	return BLK_STS_OK;
+}
+
+static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct ublk_queue *ubq = hctx->driver_data;
+	struct request *rq = bd->rq;
+	blk_status_t res;
+
 	if (unlikely(ubq->canceling)) {
 		__ublk_abort_rq(ubq, rq);
 		return BLK_STS_OK;
 	}
 
-	blk_mq_start_request(bd->rq);
-	ublk_queue_cmd(ubq, rq);
+	res = ublk_prep_rq_batch(rq);
+	if (res != BLK_STS_OK)
+		return res;
 
+	ublk_queue_cmd(ubq, rq);
 	return BLK_STS_OK;
 }
 
+static void ublk_queue_rqs(struct rq_list *rqlist)
+{
+	struct rq_list requeue_list = { };
+	struct rq_list submit_list = { };
+	struct ublk_queue *ubq = NULL;
+	struct request *req;
+
+	while ((req = rq_list_pop(rqlist))) {
+		struct ublk_queue *this_q = req->mq_hctx->driver_data;
+
+		if (ubq && ubq != this_q && !rq_list_empty(&submit_list))
+			ublk_queue_cmd_list(ubq, &submit_list);
+		ubq = this_q;
+
+		if (ublk_prep_rq_batch(req) == BLK_STS_OK)
+			rq_list_add_tail(&submit_list, req);
+		else
+			rq_list_add_tail(&requeue_list, req);
+	}
+
+	if (ubq && !rq_list_empty(&submit_list))
+		ublk_queue_cmd_list(ubq, &submit_list);
+	*rqlist = requeue_list;
+}
+
 static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
 		unsigned int hctx_idx)
 {
@@ -1347,6 +1412,7 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
 
 static const struct blk_mq_ops ublk_mq_ops = {
 	.queue_rq	= ublk_queue_rq,
+	.queue_rqs	= ublk_queue_rqs,
 	.init_hctx	= ublk_init_hctx,
 	.timeout	= ublk_timeout,
 };
@@ -3147,6 +3213,9 @@ static int __init ublk_init(void)
 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
 
+	BUILD_BUG_ON(sizeof(struct ublk_uring_cmd_pdu) >
+			sizeof_field(struct io_uring_cmd, pdu));
+
 	init_waitqueue_head(&ublk_idr_wq);
 
 	ret = misc_register(&ublk_misc);
Implement ->queue_rqs() for improving perf in case of MQ.

In this way, we just need to call io_uring_cmd_complete_in_task() once for
one batch, then both io_uring and ublk server can get exact batch from
client side.

Follows IOPS improvement:

- tests

  tools/testing/selftests/ublk/kublk add -t null -q 2 [-z]

  fio/t/io_uring -p0 /dev/ublkb0

- results:

  more than 10% IOPS boost observed

Pass all ublk selftests, especially the io dispatch order test.

Cc: Uday Shankar <ushankar@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/block/ublk_drv.c | 85 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 77 insertions(+), 8 deletions(-)