Message ID | 20210316031523.864506-9-ming.lei@redhat.com (mailing list archive) |
---|---|
State | Superseded, archived |
Delegated to: | Mike Snitzer |
Headers | show |
Series | block: support bio based io polling | expand |
It is a giant progress to gather all split bios that need to be polled in a per-task queue. Still some comments below. On 3/16/21 11:15 AM, Ming Lei wrote: > Currently bio based IO poll needs to poll all hw queue blindly, this way > is very inefficient, and the big reason is that we can't pass bio > submission result to io poll task. > > In IO submission context, store associated underlying bios into the > submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, > and return current->pid to caller of submit_bio() for any DM or bio based > driver's IO, which is submitted from FS. > > In IO poll context, the passed cookie tells us the PID of submission > context, and we can find the bio from that submission context. Moving > bio from submission queue to poll queue of the poll context, and keep > polling until these bios are ended. Remove bio from poll queue if the > bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. > > Usually submission shares context with io poll. The per-task poll context > is just like stack variable, and it is cheap to move data between the two > per-task queues. > > Signed-off-by: Ming Lei <ming.lei@redhat.com> > --- > block/bio.c | 5 ++ > block/blk-core.c | 74 +++++++++++++++++- > block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- > include/linux/blk_types.h | 3 + > 4 files changed, 235 insertions(+), 3 deletions(-) > > diff --git a/block/bio.c b/block/bio.c > index a1c4d2900c7a..bcf5eca0e8e3 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) > **/ > void bio_endio(struct bio *bio) > { > + /* BIO_END_BY_POLL has to be set before calling submit_bio */ > + if (bio_flagged(bio, BIO_END_BY_POLL)) { > + bio_set_flag(bio, BIO_DONE); > + return; > + } > again: > if (!bio_remaining_done(bio)) > return; > diff --git a/block/blk-core.c b/block/blk-core.c > index a082bbc856fb..970b23fa2e6e 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, > bio->bi_opf |= REQ_TAG; > } > > +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) > +{ > + struct blk_bio_poll_data data = { > + .bio = bio, > + }; > + struct blk_bio_poll_ctx *pc = ioc->data; > + unsigned int queued; > + > + /* lock is required if there is more than one writer */ > + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { > + spin_lock(&pc->lock); > + queued = kfifo_put(&pc->sq, data); > + spin_unlock(&pc->lock); > + } else { > + queued = kfifo_put(&pc->sq, data); > + } > + > + /* > + * Now the bio is added per-task fifo, mark it as END_BY_POLL, > + * so we can save cookie into this bio after submit_bio(). > + */ > + if (queued) > + bio_set_flag(bio, BIO_END_BY_POLL); > + else > + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); > + > + return queued; > +} The size of kfifo is limited, and it seems that once the sq of kfifio is full, REQ_HIPRI flag is cleared and the corresponding bio is actually enqueued into the default hw queue, which is IRQ driven. > + > +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie) > +{ > + bio->bi_iter.bi_private_data = cookie; > +} > + > static noinline_for_stack bool submit_bio_checks(struct bio *bio) > { > struct block_device *bdev = bio->bi_bdev; > @@ -1008,7 +1042,7 @@ static blk_qc_t __submit_bio(struct bio *bio) > * bio_list_on_stack[1] contains bios that were submitted before the current > * ->submit_bio_bio, but that haven't been processed yet. > */ > -static blk_qc_t __submit_bio_noacct(struct bio *bio) > +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc) > { > struct bio_list bio_list_on_stack[2]; > blk_qc_t ret = BLK_QC_T_NONE; > @@ -1031,7 +1065,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > bio_list_on_stack[1] = bio_list_on_stack[0]; > bio_list_init(&bio_list_on_stack[0]); > > - ret = __submit_bio(bio); > + if (ioc && queue_is_mq(q) && > + (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) { > + bool queued = blk_bio_poll_prep_submit(ioc, bio); > + > + ret = __submit_bio(bio); > + if (queued) > + blk_bio_poll_post_submit(bio, ret); > + } else { > + ret = __submit_bio(bio); > + } > > /* > * Sort new bios into those for a lower level and those for the > @@ -1057,6 +1100,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > return ret; > } > > +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, > + struct io_context *ioc) > +{ > + struct blk_bio_poll_ctx *pc = ioc->data; > + int entries = kfifo_len(&pc->sq); > + > + __submit_bio_noacct_int(bio, ioc); > + > + /* bio submissions queued to per-task poll context */ > + if (kfifo_len(&pc->sq) > entries) > + return current->pid; > + > + /* swapper's pid is 0, but it can't submit poll IO for us */ > + return 0; > +} > + > +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) > +{ > + struct io_context *ioc = current->io_context; > + > + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) > + return __submit_bio_noacct_poll(bio, ioc); > + > + return __submit_bio_noacct_int(bio, NULL); > +} > + > + > static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) > { > struct bio_list bio_list[2] = { }; > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 03f59915fe2c..4e6f1467d303 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -3865,14 +3865,168 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, > return ret; > } > > +static blk_qc_t bio_get_poll_cookie(struct bio *bio) > +{ > + return bio->bi_iter.bi_private_data; > +} > + > +static int blk_mq_poll_io(struct bio *bio) > +{ > + struct request_queue *q = bio->bi_bdev->bd_disk->queue; > + blk_qc_t cookie = bio_get_poll_cookie(bio); > + int ret = 0; > + > + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { > + struct blk_mq_hw_ctx *hctx = > + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; > + > + ret += blk_mq_poll_hctx(q, hctx); > + } > + return ret; > +} > + > +static int blk_bio_poll_and_end_io(struct request_queue *q, > + struct blk_bio_poll_ctx *poll_ctx) > +{ > + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; > + int ret = 0; > + int i; > + > + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { > + struct bio *bio = poll_data[i].bio; > + > + if (!bio) > + continue; > + > + ret += blk_mq_poll_io(bio); > + if (bio_flagged(bio, BIO_DONE)) { > + poll_data[i].bio = NULL; > + > + /* clear BIO_END_BY_POLL and end me really */ > + bio_clear_flag(bio, BIO_END_BY_POLL); > + bio_endio(bio); > + } > + } > + return ret; > +} When there are multiple threads polling, saying thread A and thread B, then there's one bio which should be polled by thread A (the pid is passed to thread A), while it's actually completed by thread B. In this case, when the bio is completed by thread B, the bio is not really completed and one extra blk_poll() still needs to be called. > + > +static int __blk_bio_poll_io(struct request_queue *q, > + struct blk_bio_poll_ctx *submit_ctx, > + struct blk_bio_poll_ctx *poll_ctx) > +{ > + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; > + int i; > + > + /* > + * Move IO submission result from submission queue in submission > + * context to poll queue of poll context. > + * > + * There may be more than one readers on poll queue of the same > + * submission context, so have to lock here. > + */ > + spin_lock(&submit_ctx->lock); > + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { > + if (poll_data[i].bio == NULL && > + !kfifo_get(&submit_ctx->sq, &poll_data[i])) > + break; > + } > + spin_unlock(&submit_ctx->lock); > + > + return blk_bio_poll_and_end_io(q, poll_ctx); > +} > + > +static int blk_bio_poll_io(struct request_queue *q, > + struct io_context *submit_ioc, > + struct io_context *poll_ioc) > +{ > + struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data; > + struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data; > + int ret; > + > + if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) { > + mutex_lock(&poll_ctx->pq_lock); > + ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx); > + mutex_unlock(&poll_ctx->pq_lock); > + } else { > + ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx); > + } > + return ret; > +} > + > +static bool blk_bio_ioc_valid(struct task_struct *t) > +{ > + if (!t) > + return false; > + > + if (!t->io_context) > + return false; > + > + if (!t->io_context->data) > + return false; > + > + return true; > +} > + > +static int __blk_bio_poll(struct request_queue *q, blk_qc_t cookie) > +{ > + struct io_context *poll_ioc = current->io_context; > + pid_t pid; > + struct task_struct *submit_task; > + int ret; > + > + pid = (pid_t)cookie; > + > + /* io poll often share io submission context */ > + if (likely(current->pid == pid && blk_bio_ioc_valid(current))) > + return blk_bio_poll_io(q, poll_ioc, poll_ioc); > + > + submit_task = find_get_task_by_vpid(pid); > + if (likely(blk_bio_ioc_valid(submit_task))) > + ret = blk_bio_poll_io(q, submit_task->io_context, > + poll_ioc); > + else > + ret = 0; > + > + put_task_struct(submit_task); > + > + return ret; > +} > + > static int blk_bio_poll(struct request_queue *q, blk_qc_t cookie, bool spin) > { > + long state; > + > + /* no need to poll */ > + if (cookie == 0) > + return 0; > + > /* > * Create poll queue for storing poll bio and its cookie from > * submission queue > */ > blk_create_io_context(q, true); > > + state = current->state; > + do { > + int ret; > + > + ret = __blk_bio_poll(q, cookie); > + if (ret > 0) { > + __set_current_state(TASK_RUNNING); > + return ret; > + } > + > + if (signal_pending_state(state, current)) > + __set_current_state(TASK_RUNNING); > + > + if (current->state == TASK_RUNNING) > + return 1; > + if (ret < 0 || !spin) > + break; > + cpu_relax(); > + } while (!need_resched()); > + > + __set_current_state(TASK_RUNNING); > return 0; > } > > @@ -3893,7 +4047,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) > struct blk_mq_hw_ctx *hctx; > long state; > > - if (!blk_qc_t_valid(cookie) || !blk_queue_poll(q)) > + if (!blk_queue_poll(q) || (queue_is_mq(q) && !blk_qc_t_valid(cookie))) > return 0; > > if (current->plug) > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index a1bcade4bcc3..53f64eea9652 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -304,6 +304,9 @@ enum { > BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ > BIO_TRACKED, /* set if bio goes through the rq_qos path */ > BIO_REMAPPED, > + BIO_END_BY_POLL, /* end by blk_bio_poll() explicitly */ > + /* set when bio can be ended, used for bio with BIO_END_BY_POLL */ > + BIO_DONE, > BIO_FLAG_LAST > }; > >
On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: > It is a giant progress to gather all split bios that need to be polled > in a per-task queue. Still some comments below. > > > On 3/16/21 11:15 AM, Ming Lei wrote: > > Currently bio based IO poll needs to poll all hw queue blindly, this way > > is very inefficient, and the big reason is that we can't pass bio > > submission result to io poll task. > > > > In IO submission context, store associated underlying bios into the > > submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, > > and return current->pid to caller of submit_bio() for any DM or bio based > > driver's IO, which is submitted from FS. > > > > In IO poll context, the passed cookie tells us the PID of submission > > context, and we can find the bio from that submission context. Moving > > bio from submission queue to poll queue of the poll context, and keep > > polling until these bios are ended. Remove bio from poll queue if the > > bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. > > > > Usually submission shares context with io poll. The per-task poll context > > is just like stack variable, and it is cheap to move data between the two > > per-task queues. > > > > Signed-off-by: Ming Lei <ming.lei@redhat.com> > > --- > > block/bio.c | 5 ++ > > block/blk-core.c | 74 +++++++++++++++++- > > block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- > > include/linux/blk_types.h | 3 + > > 4 files changed, 235 insertions(+), 3 deletions(-) > > > > diff --git a/block/bio.c b/block/bio.c > > index a1c4d2900c7a..bcf5eca0e8e3 100644 > > --- a/block/bio.c > > +++ b/block/bio.c > > @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) > > **/ > > void bio_endio(struct bio *bio) > > { > > + /* BIO_END_BY_POLL has to be set before calling submit_bio */ > > + if (bio_flagged(bio, BIO_END_BY_POLL)) { > > + bio_set_flag(bio, BIO_DONE); > > + return; > > + } > > again: > > if (!bio_remaining_done(bio)) > > return; > > diff --git a/block/blk-core.c b/block/blk-core.c > > index a082bbc856fb..970b23fa2e6e 100644 > > --- a/block/blk-core.c > > +++ b/block/blk-core.c > > @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, > > bio->bi_opf |= REQ_TAG; > > } > > > > +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) > > +{ > > + struct blk_bio_poll_data data = { > > + .bio = bio, > > + }; > > + struct blk_bio_poll_ctx *pc = ioc->data; > > + unsigned int queued; > > + > > + /* lock is required if there is more than one writer */ > > + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { > > + spin_lock(&pc->lock); > > + queued = kfifo_put(&pc->sq, data); > > + spin_unlock(&pc->lock); > > + } else { > > + queued = kfifo_put(&pc->sq, data); > > + } > > + > > + /* > > + * Now the bio is added per-task fifo, mark it as END_BY_POLL, > > + * so we can save cookie into this bio after submit_bio(). > > + */ > > + if (queued) > > + bio_set_flag(bio, BIO_END_BY_POLL); > > + else > > + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); > > + > > + return queued; > > +} > > The size of kfifo is limited, and it seems that once the sq of kfifio is > full, REQ_HIPRI flag is cleared and the corresponding bio is actually > enqueued into the default hw queue, which is IRQ driven. Yeah, this patch starts with 64 queue depth, and we can increase it to 128, which should cover most of cases. > > > > + > > +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie) > > +{ > > + bio->bi_iter.bi_private_data = cookie; > > +} > > + > > static noinline_for_stack bool submit_bio_checks(struct bio *bio) > > { > > struct block_device *bdev = bio->bi_bdev; > > @@ -1008,7 +1042,7 @@ static blk_qc_t __submit_bio(struct bio *bio) > > * bio_list_on_stack[1] contains bios that were submitted before the current > > * ->submit_bio_bio, but that haven't been processed yet. > > */ > > -static blk_qc_t __submit_bio_noacct(struct bio *bio) > > +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc) > > { > > struct bio_list bio_list_on_stack[2]; > > blk_qc_t ret = BLK_QC_T_NONE; > > @@ -1031,7 +1065,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > > bio_list_on_stack[1] = bio_list_on_stack[0]; > > bio_list_init(&bio_list_on_stack[0]); > > > > - ret = __submit_bio(bio); > > + if (ioc && queue_is_mq(q) && > > + (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) { > > + bool queued = blk_bio_poll_prep_submit(ioc, bio); > > + > > + ret = __submit_bio(bio); > > + if (queued) > > + blk_bio_poll_post_submit(bio, ret); > > + } else { > > + ret = __submit_bio(bio); > > + } > > > > /* > > * Sort new bios into those for a lower level and those for the > > @@ -1057,6 +1100,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > > return ret; > > } > > > > +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, > > + struct io_context *ioc) > > +{ > > + struct blk_bio_poll_ctx *pc = ioc->data; > > + int entries = kfifo_len(&pc->sq); > > + > > + __submit_bio_noacct_int(bio, ioc); > > + > > + /* bio submissions queued to per-task poll context */ > > + if (kfifo_len(&pc->sq) > entries) > > + return current->pid; > > + > > + /* swapper's pid is 0, but it can't submit poll IO for us */ > > + return 0; > > +} > > + > > +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) > > +{ > > + struct io_context *ioc = current->io_context; > > + > > + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) > > + return __submit_bio_noacct_poll(bio, ioc); > > + > > + return __submit_bio_noacct_int(bio, NULL); > > +} > > + > > + > > static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) > > { > > struct bio_list bio_list[2] = { }; > > diff --git a/block/blk-mq.c b/block/blk-mq.c > > index 03f59915fe2c..4e6f1467d303 100644 > > --- a/block/blk-mq.c > > +++ b/block/blk-mq.c > > @@ -3865,14 +3865,168 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, > > return ret; > > } > > > > +static blk_qc_t bio_get_poll_cookie(struct bio *bio) > > +{ > > + return bio->bi_iter.bi_private_data; > > +} > > + > > +static int blk_mq_poll_io(struct bio *bio) > > +{ > > + struct request_queue *q = bio->bi_bdev->bd_disk->queue; > > + blk_qc_t cookie = bio_get_poll_cookie(bio); > > + int ret = 0; > > + > > + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { > > + struct blk_mq_hw_ctx *hctx = > > + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; > > + > > + ret += blk_mq_poll_hctx(q, hctx); > > + } > > + return ret; > > +} > > + > > +static int blk_bio_poll_and_end_io(struct request_queue *q, > > + struct blk_bio_poll_ctx *poll_ctx) > > +{ > > + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; > > + int ret = 0; > > + int i; > > + > > + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { > > + struct bio *bio = poll_data[i].bio; > > + > > + if (!bio) > > + continue; > > + > > + ret += blk_mq_poll_io(bio); > > + if (bio_flagged(bio, BIO_DONE)) { > > + poll_data[i].bio = NULL; > > + > > + /* clear BIO_END_BY_POLL and end me really */ > > + bio_clear_flag(bio, BIO_END_BY_POLL); > > + bio_endio(bio); > > + } > > + } > > + return ret; > > +} > > When there are multiple threads polling, saying thread A and thread B, > then there's one bio which should be polled by thread A (the pid is > passed to thread A), while it's actually completed by thread B. In this > case, when the bio is completed by thread B, the bio is not really > completed and one extra blk_poll() still needs to be called. When this happens, the dm bio can't be completed, and the associated kiocb can't be completed too, io_uring or other poll code context will keep calling blk_poll() by passing thread A's pid until this dm bio is done, since the dm bio is submitted from thread A. Thanks, Ming -- dm-devel mailing list dm-devel@redhat.com https://listman.redhat.com/mailman/listinfo/dm-devel
On 3/16/21 3:17 PM, Ming Lei wrote: > On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: >> It is a giant progress to gather all split bios that need to be polled >> in a per-task queue. Still some comments below. >> >> >> On 3/16/21 11:15 AM, Ming Lei wrote: >>> Currently bio based IO poll needs to poll all hw queue blindly, this way >>> is very inefficient, and the big reason is that we can't pass bio >>> submission result to io poll task. >>> >>> In IO submission context, store associated underlying bios into the >>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, >>> and return current->pid to caller of submit_bio() for any DM or bio based >>> driver's IO, which is submitted from FS. >>> >>> In IO poll context, the passed cookie tells us the PID of submission >>> context, and we can find the bio from that submission context. Moving >>> bio from submission queue to poll queue of the poll context, and keep >>> polling until these bios are ended. Remove bio from poll queue if the >>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. >>> >>> Usually submission shares context with io poll. The per-task poll context >>> is just like stack variable, and it is cheap to move data between the two >>> per-task queues. >>> >>> Signed-off-by: Ming Lei <ming.lei@redhat.com> >>> --- >>> block/bio.c | 5 ++ >>> block/blk-core.c | 74 +++++++++++++++++- >>> block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- >>> include/linux/blk_types.h | 3 + >>> 4 files changed, 235 insertions(+), 3 deletions(-) >>> >>> diff --git a/block/bio.c b/block/bio.c >>> index a1c4d2900c7a..bcf5eca0e8e3 100644 >>> --- a/block/bio.c >>> +++ b/block/bio.c >>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) >>> **/ >>> void bio_endio(struct bio *bio) >>> { >>> + /* BIO_END_BY_POLL has to be set before calling submit_bio */ >>> + if (bio_flagged(bio, BIO_END_BY_POLL)) { >>> + bio_set_flag(bio, BIO_DONE); >>> + return; >>> + } >>> again: >>> if (!bio_remaining_done(bio)) >>> return; >>> diff --git a/block/blk-core.c b/block/blk-core.c >>> index a082bbc856fb..970b23fa2e6e 100644 >>> --- a/block/blk-core.c >>> +++ b/block/blk-core.c >>> @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, >>> bio->bi_opf |= REQ_TAG; >>> } >>> >>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) >>> +{ >>> + struct blk_bio_poll_data data = { >>> + .bio = bio, >>> + }; >>> + struct blk_bio_poll_ctx *pc = ioc->data; >>> + unsigned int queued; >>> + >>> + /* lock is required if there is more than one writer */ >>> + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { >>> + spin_lock(&pc->lock); >>> + queued = kfifo_put(&pc->sq, data); >>> + spin_unlock(&pc->lock); >>> + } else { >>> + queued = kfifo_put(&pc->sq, data); >>> + } >>> + >>> + /* >>> + * Now the bio is added per-task fifo, mark it as END_BY_POLL, >>> + * so we can save cookie into this bio after submit_bio(). >>> + */ >>> + if (queued) >>> + bio_set_flag(bio, BIO_END_BY_POLL); >>> + else >>> + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); >>> + >>> + return queued; >>> +} >> >> The size of kfifo is limited, and it seems that once the sq of kfifio is >> full, REQ_HIPRI flag is cleared and the corresponding bio is actually >> enqueued into the default hw queue, which is IRQ driven. > > Yeah, this patch starts with 64 queue depth, and we can increase it to > 128, which should cover most of cases. It seems that the queue depth of kfifo will affect the performance as I did a fast test. Test Result: BLK_BIO_POLL_SQ_SZ | iodepth | IOPS ------------------ | ------- | ---- 64 | 128 | 301k (IRQ) -> 340k (iopoll) 64 | 16 | 304k (IRQ) -> 392k (iopoll) 128 | 128 | 204k (IRQ) -> 317k (iopoll) 256 | 128 | 241k (IRQ) -> 391k (iopoll) It seems that BLK_BIO_POLL_SQ_SZ need to be increased accordingly when iodepth is quite large. But I don't know why the performance in IRQ mode decreases when BLK_BIO_POLL_SQ_SZ is increased. Test Environment: nvme.poll_queues = 1, dmsetup create testdev --table '0 2097152 linear /dev/nvme0n1 0', ``` $cat fio.conf [global] name=iouring-sqpoll-iopoll-1 ioengine=io_uring iodepth=128 numjobs=1 thread rw=randread direct=1 #hipri=1 bs=4k runtime=10 time_based group_reporting randrepeat=0 cpus_allowed=44 [device] filename=/dev/mapper/testdev ``` > >> >> >>> + >>> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie) >>> +{ >>> + bio->bi_iter.bi_private_data = cookie; >>> +} >>> + >>> static noinline_for_stack bool submit_bio_checks(struct bio *bio) >>> { >>> struct block_device *bdev = bio->bi_bdev; >>> @@ -1008,7 +1042,7 @@ static blk_qc_t __submit_bio(struct bio *bio) >>> * bio_list_on_stack[1] contains bios that were submitted before the current >>> * ->submit_bio_bio, but that haven't been processed yet. >>> */ >>> -static blk_qc_t __submit_bio_noacct(struct bio *bio) >>> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc) >>> { >>> struct bio_list bio_list_on_stack[2]; >>> blk_qc_t ret = BLK_QC_T_NONE; >>> @@ -1031,7 +1065,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) >>> bio_list_on_stack[1] = bio_list_on_stack[0]; >>> bio_list_init(&bio_list_on_stack[0]); >>> >>> - ret = __submit_bio(bio); >>> + if (ioc && queue_is_mq(q) && >>> + (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) { >>> + bool queued = blk_bio_poll_prep_submit(ioc, bio); >>> + >>> + ret = __submit_bio(bio); >>> + if (queued) >>> + blk_bio_poll_post_submit(bio, ret); >>> + } else { >>> + ret = __submit_bio(bio); >>> + } >>> >>> /* >>> * Sort new bios into those for a lower level and those for the >>> @@ -1057,6 +1100,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) >>> return ret; >>> } >>> >>> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, >>> + struct io_context *ioc) >>> +{ >>> + struct blk_bio_poll_ctx *pc = ioc->data; >>> + int entries = kfifo_len(&pc->sq); >>> + >>> + __submit_bio_noacct_int(bio, ioc); >>> + >>> + /* bio submissions queued to per-task poll context */ >>> + if (kfifo_len(&pc->sq) > entries) >>> + return current->pid; >>> + >>> + /* swapper's pid is 0, but it can't submit poll IO for us */ >>> + return 0; >>> +} >>> + >>> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) >>> +{ >>> + struct io_context *ioc = current->io_context; >>> + >>> + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) >>> + return __submit_bio_noacct_poll(bio, ioc); >>> + >>> + return __submit_bio_noacct_int(bio, NULL); >>> +} >>> + >>> + >>> static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) >>> { >>> struct bio_list bio_list[2] = { }; >>> diff --git a/block/blk-mq.c b/block/blk-mq.c >>> index 03f59915fe2c..4e6f1467d303 100644 >>> --- a/block/blk-mq.c >>> +++ b/block/blk-mq.c >>> @@ -3865,14 +3865,168 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, >>> return ret; >>> } >>> >>> +static blk_qc_t bio_get_poll_cookie(struct bio *bio) >>> +{ >>> + return bio->bi_iter.bi_private_data; >>> +} >>> + >>> +static int blk_mq_poll_io(struct bio *bio) >>> +{ >>> + struct request_queue *q = bio->bi_bdev->bd_disk->queue; >>> + blk_qc_t cookie = bio_get_poll_cookie(bio); >>> + int ret = 0; >>> + >>> + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { >>> + struct blk_mq_hw_ctx *hctx = >>> + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; >>> + >>> + ret += blk_mq_poll_hctx(q, hctx); >>> + } >>> + return ret; >>> +} >>> + >>> +static int blk_bio_poll_and_end_io(struct request_queue *q, >>> + struct blk_bio_poll_ctx *poll_ctx) >>> +{ >>> + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; >>> + int ret = 0; >>> + int i; >>> + >>> + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { >>> + struct bio *bio = poll_data[i].bio; >>> + >>> + if (!bio) >>> + continue; >>> + >>> + ret += blk_mq_poll_io(bio); >>> + if (bio_flagged(bio, BIO_DONE)) { >>> + poll_data[i].bio = NULL; >>> + >>> + /* clear BIO_END_BY_POLL and end me really */ >>> + bio_clear_flag(bio, BIO_END_BY_POLL); >>> + bio_endio(bio); >>> + } >>> + } >>> + return ret; >>> +} >> >> When there are multiple threads polling, saying thread A and thread B, >> then there's one bio which should be polled by thread A (the pid is >> passed to thread A), while it's actually completed by thread B. In this >> case, when the bio is completed by thread B, the bio is not really >> completed and one extra blk_poll() still needs to be called. > > When this happens, the dm bio can't be completed, and the associated > kiocb can't be completed too, io_uring or other poll code context will > keep calling blk_poll() by passing thread A's pid until this dm bio is > done, since the dm bio is submitted from thread A. > > > Thanks, > Ming >
On 3/16/21 3:17 PM, Ming Lei wrote: > On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: >> It is a giant progress to gather all split bios that need to be polled >> in a per-task queue. Still some comments below. >> >> >> On 3/16/21 11:15 AM, Ming Lei wrote: >>> Currently bio based IO poll needs to poll all hw queue blindly, this way >>> is very inefficient, and the big reason is that we can't pass bio >>> submission result to io poll task. >>> >>> In IO submission context, store associated underlying bios into the >>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, >>> and return current->pid to caller of submit_bio() for any DM or bio based >>> driver's IO, which is submitted from FS. >>> >>> In IO poll context, the passed cookie tells us the PID of submission >>> context, and we can find the bio from that submission context. Moving >>> bio from submission queue to poll queue of the poll context, and keep >>> polling until these bios are ended. Remove bio from poll queue if the >>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. >>> >>> Usually submission shares context with io poll. The per-task poll context >>> is just like stack variable, and it is cheap to move data between the two >>> per-task queues. >>> >>> Signed-off-by: Ming Lei <ming.lei@redhat.com> >>> --- >>> block/bio.c | 5 ++ >>> block/blk-core.c | 74 +++++++++++++++++- >>> block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- >>> include/linux/blk_types.h | 3 + >>> 4 files changed, 235 insertions(+), 3 deletions(-) >>> >>> diff --git a/block/bio.c b/block/bio.c >>> index a1c4d2900c7a..bcf5eca0e8e3 100644 >>> --- a/block/bio.c >>> +++ b/block/bio.c >>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) >>> **/ >>> void bio_endio(struct bio *bio) >>> { >>> + /* BIO_END_BY_POLL has to be set before calling submit_bio */ >>> + if (bio_flagged(bio, BIO_END_BY_POLL)) { >>> + bio_set_flag(bio, BIO_DONE); >>> + return; >>> + } >>> again: >>> if (!bio_remaining_done(bio)) >>> return; >>> diff --git a/block/blk-core.c b/block/blk-core.c >>> index a082bbc856fb..970b23fa2e6e 100644 >>> --- a/block/blk-core.c >>> +++ b/block/blk-core.c >>> @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, >>> bio->bi_opf |= REQ_TAG; >>> } >>> >>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) >>> +{ >>> + struct blk_bio_poll_data data = { >>> + .bio = bio, >>> + }; >>> + struct blk_bio_poll_ctx *pc = ioc->data; >>> + unsigned int queued; >>> + >>> + /* lock is required if there is more than one writer */ >>> + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { >>> + spin_lock(&pc->lock); >>> + queued = kfifo_put(&pc->sq, data); >>> + spin_unlock(&pc->lock); >>> + } else { >>> + queued = kfifo_put(&pc->sq, data); >>> + } >>> + >>> + /* >>> + * Now the bio is added per-task fifo, mark it as END_BY_POLL, >>> + * so we can save cookie into this bio after submit_bio(). >>> + */ >>> + if (queued) >>> + bio_set_flag(bio, BIO_END_BY_POLL); >>> + else >>> + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); >>> + >>> + return queued; >>> +} >> >> The size of kfifo is limited, and it seems that once the sq of kfifio is >> full, REQ_HIPRI flag is cleared and the corresponding bio is actually >> enqueued into the default hw queue, which is IRQ driven. > > Yeah, this patch starts with 64 queue depth, and we can increase it to > 128, which should cover most of cases. > >> >> >>> + >>> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie) >>> +{ >>> + bio->bi_iter.bi_private_data = cookie; >>> +} >>> + >>> static noinline_for_stack bool submit_bio_checks(struct bio *bio) >>> { >>> struct block_device *bdev = bio->bi_bdev; >>> @@ -1008,7 +1042,7 @@ static blk_qc_t __submit_bio(struct bio *bio) >>> * bio_list_on_stack[1] contains bios that were submitted before the current >>> * ->submit_bio_bio, but that haven't been processed yet. >>> */ >>> -static blk_qc_t __submit_bio_noacct(struct bio *bio) >>> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc) >>> { >>> struct bio_list bio_list_on_stack[2]; >>> blk_qc_t ret = BLK_QC_T_NONE; >>> @@ -1031,7 +1065,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) >>> bio_list_on_stack[1] = bio_list_on_stack[0]; >>> bio_list_init(&bio_list_on_stack[0]); >>> >>> - ret = __submit_bio(bio); >>> + if (ioc && queue_is_mq(q) && >>> + (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) { >>> + bool queued = blk_bio_poll_prep_submit(ioc, bio); >>> + >>> + ret = __submit_bio(bio); >>> + if (queued) >>> + blk_bio_poll_post_submit(bio, ret); >>> + } else { >>> + ret = __submit_bio(bio); >>> + } >>> >>> /* >>> * Sort new bios into those for a lower level and those for the >>> @@ -1057,6 +1100,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) >>> return ret; >>> } >>> >>> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, >>> + struct io_context *ioc) >>> +{ >>> + struct blk_bio_poll_ctx *pc = ioc->data; >>> + int entries = kfifo_len(&pc->sq); >>> + >>> + __submit_bio_noacct_int(bio, ioc); >>> + >>> + /* bio submissions queued to per-task poll context */ >>> + if (kfifo_len(&pc->sq) > entries) >>> + return current->pid; >>> + >>> + /* swapper's pid is 0, but it can't submit poll IO for us */ >>> + return 0; >>> +} >>> + >>> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) >>> +{ >>> + struct io_context *ioc = current->io_context; >>> + >>> + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) >>> + return __submit_bio_noacct_poll(bio, ioc); >>> + >>> + return __submit_bio_noacct_int(bio, NULL); >>> +} >>> + >>> + >>> static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) >>> { >>> struct bio_list bio_list[2] = { }; >>> diff --git a/block/blk-mq.c b/block/blk-mq.c >>> index 03f59915fe2c..4e6f1467d303 100644 >>> --- a/block/blk-mq.c >>> +++ b/block/blk-mq.c >>> @@ -3865,14 +3865,168 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, >>> return ret; >>> } >>> >>> +static blk_qc_t bio_get_poll_cookie(struct bio *bio) >>> +{ >>> + return bio->bi_iter.bi_private_data; >>> +} >>> + >>> +static int blk_mq_poll_io(struct bio *bio) >>> +{ >>> + struct request_queue *q = bio->bi_bdev->bd_disk->queue; >>> + blk_qc_t cookie = bio_get_poll_cookie(bio); >>> + int ret = 0; >>> + >>> + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { >>> + struct blk_mq_hw_ctx *hctx = >>> + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; >>> + >>> + ret += blk_mq_poll_hctx(q, hctx); >>> + } >>> + return ret; >>> +} >>> + >>> +static int blk_bio_poll_and_end_io(struct request_queue *q, >>> + struct blk_bio_poll_ctx *poll_ctx) >>> +{ >>> + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; >>> + int ret = 0; >>> + int i; >>> + >>> + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { >>> + struct bio *bio = poll_data[i].bio; >>> + >>> + if (!bio) >>> + continue; >>> + >>> + ret += blk_mq_poll_io(bio); >>> + if (bio_flagged(bio, BIO_DONE)) { >>> + poll_data[i].bio = NULL; >>> + >>> + /* clear BIO_END_BY_POLL and end me really */ >>> + bio_clear_flag(bio, BIO_END_BY_POLL); >>> + bio_endio(bio); >>> + } >>> + } >>> + return ret; >>> +} >> >> When there are multiple threads polling, saying thread A and thread B, >> then there's one bio which should be polled by thread A (the pid is >> passed to thread A), while it's actually completed by thread B. In this >> case, when the bio is completed by thread B, the bio is not really >> completed and one extra blk_poll() still needs to be called. > > When this happens, the dm bio can't be completed, and the associated > kiocb can't be completed too, io_uring or other poll code context will > keep calling blk_poll() by passing thread A's pid until this dm bio is > done, since the dm bio is submitted from thread A. > This will affect the multi-thread polling performance. I tested dm-stripe, in which every bio will be split and enqueued into all underlying devices, and thus amplify the interference between multiple threads. Test Result: IOPS: 332k (IRQ) -> 363k (iopoll), aka ~10% performance gain Test Environment: nvme.poll_queues = 3 BLK_BIO_POLL_SQ_SZ = 128 dmsetup create testdev --table "0 629145600 striped 3 8 /dev/nvme0n1 0 /dev/nvme1n1 0 /dev/nvme4n1 0" ``` $cat fio.conf [global] name=iouring-sqpoll-iopoll-1 ioengine=io_uring iodepth=128 numjobs=1 thread rw=randread direct=1 hipri=1 runtime=10 time_based group_reporting randrepeat=0 filename=/dev/mapper/testdev bs=12k [job-1] cpus_allowed=14 [job-2] cpus_allowed=16 [job-3] cpus_allowed=84 ```
On Tue, Mar 16, 2021 at 04:52:36PM +0800, JeffleXu wrote: > > > On 3/16/21 3:17 PM, Ming Lei wrote: > > On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: > >> It is a giant progress to gather all split bios that need to be polled > >> in a per-task queue. Still some comments below. > >> > >> > >> On 3/16/21 11:15 AM, Ming Lei wrote: > >>> Currently bio based IO poll needs to poll all hw queue blindly, this way > >>> is very inefficient, and the big reason is that we can't pass bio > >>> submission result to io poll task. > >>> > >>> In IO submission context, store associated underlying bios into the > >>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, > >>> and return current->pid to caller of submit_bio() for any DM or bio based > >>> driver's IO, which is submitted from FS. > >>> > >>> In IO poll context, the passed cookie tells us the PID of submission > >>> context, and we can find the bio from that submission context. Moving > >>> bio from submission queue to poll queue of the poll context, and keep > >>> polling until these bios are ended. Remove bio from poll queue if the > >>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. > >>> > >>> Usually submission shares context with io poll. The per-task poll context > >>> is just like stack variable, and it is cheap to move data between the two > >>> per-task queues. > >>> > >>> Signed-off-by: Ming Lei <ming.lei@redhat.com> > >>> --- > >>> block/bio.c | 5 ++ > >>> block/blk-core.c | 74 +++++++++++++++++- > >>> block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- > >>> include/linux/blk_types.h | 3 + > >>> 4 files changed, 235 insertions(+), 3 deletions(-) > >>> > >>> diff --git a/block/bio.c b/block/bio.c > >>> index a1c4d2900c7a..bcf5eca0e8e3 100644 > >>> --- a/block/bio.c > >>> +++ b/block/bio.c > >>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) > >>> **/ > >>> void bio_endio(struct bio *bio) > >>> { > >>> + /* BIO_END_BY_POLL has to be set before calling submit_bio */ > >>> + if (bio_flagged(bio, BIO_END_BY_POLL)) { > >>> + bio_set_flag(bio, BIO_DONE); > >>> + return; > >>> + } > >>> again: > >>> if (!bio_remaining_done(bio)) > >>> return; > >>> diff --git a/block/blk-core.c b/block/blk-core.c > >>> index a082bbc856fb..970b23fa2e6e 100644 > >>> --- a/block/blk-core.c > >>> +++ b/block/blk-core.c > >>> @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, > >>> bio->bi_opf |= REQ_TAG; > >>> } > >>> > >>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) > >>> +{ > >>> + struct blk_bio_poll_data data = { > >>> + .bio = bio, > >>> + }; > >>> + struct blk_bio_poll_ctx *pc = ioc->data; > >>> + unsigned int queued; > >>> + > >>> + /* lock is required if there is more than one writer */ > >>> + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { > >>> + spin_lock(&pc->lock); > >>> + queued = kfifo_put(&pc->sq, data); > >>> + spin_unlock(&pc->lock); > >>> + } else { > >>> + queued = kfifo_put(&pc->sq, data); > >>> + } > >>> + > >>> + /* > >>> + * Now the bio is added per-task fifo, mark it as END_BY_POLL, > >>> + * so we can save cookie into this bio after submit_bio(). > >>> + */ > >>> + if (queued) > >>> + bio_set_flag(bio, BIO_END_BY_POLL); > >>> + else > >>> + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); > >>> + > >>> + return queued; > >>> +} > >> > >> The size of kfifo is limited, and it seems that once the sq of kfifio is > >> full, REQ_HIPRI flag is cleared and the corresponding bio is actually > >> enqueued into the default hw queue, which is IRQ driven. > > > > Yeah, this patch starts with 64 queue depth, and we can increase it to > > 128, which should cover most of cases. > > It seems that the queue depth of kfifo will affect the performance as I > did a fast test. > > > > Test Result: > > BLK_BIO_POLL_SQ_SZ | iodepth | IOPS > ------------------ | ------- | ---- > 64 | 128 | 301k (IRQ) -> 340k (iopoll) > 64 | 16 | 304k (IRQ) -> 392k (iopoll) > 128 | 128 | 204k (IRQ) -> 317k (iopoll) > 256 | 128 | 241k (IRQ) -> 391k (iopoll) > > It seems that BLK_BIO_POLL_SQ_SZ need to be increased accordingly when > iodepth is quite large. But I don't know why the performance in IRQ mode > decreases when BLK_BIO_POLL_SQ_SZ is increased. This patchset is supposed to not affect IRQ mode because HIPRI isn't set at IRQ mode. Or you mean '--hipri' & io_uring is setup but setting nvme.poll_queues as 0 at your 'IRQ' mode test? Thanks for starting to run performance test, and so far I just run test in KVM, not start performance test yet. thanks, Ming -- dm-devel mailing list dm-devel@redhat.com https://listman.redhat.com/mailman/listinfo/dm-devel
On Tue, Mar 16, 2021 at 07:00:49PM +0800, JeffleXu wrote: > > > On 3/16/21 3:17 PM, Ming Lei wrote: > > On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: > >> It is a giant progress to gather all split bios that need to be polled > >> in a per-task queue. Still some comments below. > >> > >> > >> On 3/16/21 11:15 AM, Ming Lei wrote: > >>> Currently bio based IO poll needs to poll all hw queue blindly, this way > >>> is very inefficient, and the big reason is that we can't pass bio > >>> submission result to io poll task. > >>> > >>> In IO submission context, store associated underlying bios into the > >>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, > >>> and return current->pid to caller of submit_bio() for any DM or bio based > >>> driver's IO, which is submitted from FS. > >>> > >>> In IO poll context, the passed cookie tells us the PID of submission > >>> context, and we can find the bio from that submission context. Moving > >>> bio from submission queue to poll queue of the poll context, and keep > >>> polling until these bios are ended. Remove bio from poll queue if the > >>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. > >>> > >>> Usually submission shares context with io poll. The per-task poll context > >>> is just like stack variable, and it is cheap to move data between the two > >>> per-task queues. > >>> > >>> Signed-off-by: Ming Lei <ming.lei@redhat.com> > >>> --- > >>> block/bio.c | 5 ++ > >>> block/blk-core.c | 74 +++++++++++++++++- > >>> block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- > >>> include/linux/blk_types.h | 3 + > >>> 4 files changed, 235 insertions(+), 3 deletions(-) > >>> > >>> diff --git a/block/bio.c b/block/bio.c > >>> index a1c4d2900c7a..bcf5eca0e8e3 100644 > >>> --- a/block/bio.c > >>> +++ b/block/bio.c > >>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) > >>> **/ > >>> void bio_endio(struct bio *bio) > >>> { > >>> + /* BIO_END_BY_POLL has to be set before calling submit_bio */ > >>> + if (bio_flagged(bio, BIO_END_BY_POLL)) { > >>> + bio_set_flag(bio, BIO_DONE); > >>> + return; > >>> + } > >>> again: > >>> if (!bio_remaining_done(bio)) > >>> return; > >>> diff --git a/block/blk-core.c b/block/blk-core.c > >>> index a082bbc856fb..970b23fa2e6e 100644 > >>> --- a/block/blk-core.c > >>> +++ b/block/blk-core.c > >>> @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, > >>> bio->bi_opf |= REQ_TAG; > >>> } > >>> > >>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) > >>> +{ > >>> + struct blk_bio_poll_data data = { > >>> + .bio = bio, > >>> + }; > >>> + struct blk_bio_poll_ctx *pc = ioc->data; > >>> + unsigned int queued; > >>> + > >>> + /* lock is required if there is more than one writer */ > >>> + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { > >>> + spin_lock(&pc->lock); > >>> + queued = kfifo_put(&pc->sq, data); > >>> + spin_unlock(&pc->lock); > >>> + } else { > >>> + queued = kfifo_put(&pc->sq, data); > >>> + } > >>> + > >>> + /* > >>> + * Now the bio is added per-task fifo, mark it as END_BY_POLL, > >>> + * so we can save cookie into this bio after submit_bio(). > >>> + */ > >>> + if (queued) > >>> + bio_set_flag(bio, BIO_END_BY_POLL); > >>> + else > >>> + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); > >>> + > >>> + return queued; > >>> +} > >> > >> The size of kfifo is limited, and it seems that once the sq of kfifio is > >> full, REQ_HIPRI flag is cleared and the corresponding bio is actually > >> enqueued into the default hw queue, which is IRQ driven. > > > > Yeah, this patch starts with 64 queue depth, and we can increase it to > > 128, which should cover most of cases. > > > >> > >> > >>> + > >>> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie) > >>> +{ > >>> + bio->bi_iter.bi_private_data = cookie; > >>> +} > >>> + > >>> static noinline_for_stack bool submit_bio_checks(struct bio *bio) > >>> { > >>> struct block_device *bdev = bio->bi_bdev; > >>> @@ -1008,7 +1042,7 @@ static blk_qc_t __submit_bio(struct bio *bio) > >>> * bio_list_on_stack[1] contains bios that were submitted before the current > >>> * ->submit_bio_bio, but that haven't been processed yet. > >>> */ > >>> -static blk_qc_t __submit_bio_noacct(struct bio *bio) > >>> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc) > >>> { > >>> struct bio_list bio_list_on_stack[2]; > >>> blk_qc_t ret = BLK_QC_T_NONE; > >>> @@ -1031,7 +1065,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > >>> bio_list_on_stack[1] = bio_list_on_stack[0]; > >>> bio_list_init(&bio_list_on_stack[0]); > >>> > >>> - ret = __submit_bio(bio); > >>> + if (ioc && queue_is_mq(q) && > >>> + (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) { > >>> + bool queued = blk_bio_poll_prep_submit(ioc, bio); > >>> + > >>> + ret = __submit_bio(bio); > >>> + if (queued) > >>> + blk_bio_poll_post_submit(bio, ret); > >>> + } else { > >>> + ret = __submit_bio(bio); > >>> + } > >>> > >>> /* > >>> * Sort new bios into those for a lower level and those for the > >>> @@ -1057,6 +1100,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > >>> return ret; > >>> } > >>> > >>> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, > >>> + struct io_context *ioc) > >>> +{ > >>> + struct blk_bio_poll_ctx *pc = ioc->data; > >>> + int entries = kfifo_len(&pc->sq); > >>> + > >>> + __submit_bio_noacct_int(bio, ioc); > >>> + > >>> + /* bio submissions queued to per-task poll context */ > >>> + if (kfifo_len(&pc->sq) > entries) > >>> + return current->pid; > >>> + > >>> + /* swapper's pid is 0, but it can't submit poll IO for us */ > >>> + return 0; > >>> +} > >>> + > >>> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) > >>> +{ > >>> + struct io_context *ioc = current->io_context; > >>> + > >>> + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) > >>> + return __submit_bio_noacct_poll(bio, ioc); > >>> + > >>> + return __submit_bio_noacct_int(bio, NULL); > >>> +} > >>> + > >>> + > >>> static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) > >>> { > >>> struct bio_list bio_list[2] = { }; > >>> diff --git a/block/blk-mq.c b/block/blk-mq.c > >>> index 03f59915fe2c..4e6f1467d303 100644 > >>> --- a/block/blk-mq.c > >>> +++ b/block/blk-mq.c > >>> @@ -3865,14 +3865,168 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, > >>> return ret; > >>> } > >>> > >>> +static blk_qc_t bio_get_poll_cookie(struct bio *bio) > >>> +{ > >>> + return bio->bi_iter.bi_private_data; > >>> +} > >>> + > >>> +static int blk_mq_poll_io(struct bio *bio) > >>> +{ > >>> + struct request_queue *q = bio->bi_bdev->bd_disk->queue; > >>> + blk_qc_t cookie = bio_get_poll_cookie(bio); > >>> + int ret = 0; > >>> + > >>> + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { > >>> + struct blk_mq_hw_ctx *hctx = > >>> + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; > >>> + > >>> + ret += blk_mq_poll_hctx(q, hctx); > >>> + } > >>> + return ret; > >>> +} > >>> + > >>> +static int blk_bio_poll_and_end_io(struct request_queue *q, > >>> + struct blk_bio_poll_ctx *poll_ctx) > >>> +{ > >>> + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; > >>> + int ret = 0; > >>> + int i; > >>> + > >>> + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { > >>> + struct bio *bio = poll_data[i].bio; > >>> + > >>> + if (!bio) > >>> + continue; > >>> + > >>> + ret += blk_mq_poll_io(bio); > >>> + if (bio_flagged(bio, BIO_DONE)) { > >>> + poll_data[i].bio = NULL; > >>> + > >>> + /* clear BIO_END_BY_POLL and end me really */ > >>> + bio_clear_flag(bio, BIO_END_BY_POLL); > >>> + bio_endio(bio); > >>> + } > >>> + } > >>> + return ret; > >>> +} > >> > >> When there are multiple threads polling, saying thread A and thread B, > >> then there's one bio which should be polled by thread A (the pid is > >> passed to thread A), while it's actually completed by thread B. In this > >> case, when the bio is completed by thread B, the bio is not really > >> completed and one extra blk_poll() still needs to be called. > > > > When this happens, the dm bio can't be completed, and the associated > > kiocb can't be completed too, io_uring or other poll code context will > > keep calling blk_poll() by passing thread A's pid until this dm bio is > > done, since the dm bio is submitted from thread A. > > > > This will affect the multi-thread polling performance. I tested > dm-stripe, in which every bio will be split and enqueued into all > underlying devices, and thus amplify the interference between multiple > threads. > > Test Result: > IOPS: 332k (IRQ) -> 363k (iopoll), aka ~10% performance gain > > > Test Environment: > > nvme.poll_queues = 3 > > BLK_BIO_POLL_SQ_SZ = 128 > > dmsetup create testdev --table "0 629145600 striped 3 8 /dev/nvme0n1 0 > /dev/nvme1n1 0 /dev/nvme4n1 0" > > > ``` > $cat fio.conf > [global] > name=iouring-sqpoll-iopoll-1 > ioengine=io_uring > iodepth=128 > numjobs=1 If numjobs is 1, there can't be the queue interference issue because there is only one submission job. If numjobs is 3 and nvme.poll_queues is 3, there are two cases: 1) each job is assigned with different hw queue, so no queue interference 2) two jobs share one same hw queue, and there is queue interference The behavior is decided by scheduler since the queue mapping between cpu vs. poll queue is fixed. You can compare the above two cases by passing different 'cpus_allowed' to each job in fio. > thread > rw=randread > direct=1 > hipri=1 > runtime=10 > time_based > group_reporting > randrepeat=0 > filename=/dev/mapper/testdev > bs=12k > > [job-1] > cpus_allowed=14 > > [job-2] > cpus_allowed=16 > > [job-3] > cpus_allowed=84 It depends if cpu of 14,16,84 is mapped to same hw poll queue or not. If all 3 cpus are mapped to same hw poll queue, there will be lock contention in submission path, see nvme_submit_cmd(), and the hw queue data is concurrent polled in 3 cpus too. Thanks, Ming -- dm-devel mailing list dm-devel@redhat.com https://listman.redhat.com/mailman/listinfo/dm-devel
On 3/16/21 7:00 PM, JeffleXu wrote: > > > On 3/16/21 3:17 PM, Ming Lei wrote: >> On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: >>> It is a giant progress to gather all split bios that need to be polled >>> in a per-task queue. Still some comments below. >>> >>> >>> On 3/16/21 11:15 AM, Ming Lei wrote: >>>> Currently bio based IO poll needs to poll all hw queue blindly, this way >>>> is very inefficient, and the big reason is that we can't pass bio >>>> submission result to io poll task. >>>> >>>> In IO submission context, store associated underlying bios into the >>>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, >>>> and return current->pid to caller of submit_bio() for any DM or bio based >>>> driver's IO, which is submitted from FS. >>>> >>>> In IO poll context, the passed cookie tells us the PID of submission >>>> context, and we can find the bio from that submission context. Moving >>>> bio from submission queue to poll queue of the poll context, and keep >>>> polling until these bios are ended. Remove bio from poll queue if the >>>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. >>>> >>>> Usually submission shares context with io poll. The per-task poll context >>>> is just like stack variable, and it is cheap to move data between the two >>>> per-task queues. >>>> >>>> Signed-off-by: Ming Lei <ming.lei@redhat.com> >>>> --- >>>> block/bio.c | 5 ++ >>>> block/blk-core.c | 74 +++++++++++++++++- >>>> block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- >>>> include/linux/blk_types.h | 3 + >>>> 4 files changed, 235 insertions(+), 3 deletions(-) >>>> >>>> diff --git a/block/bio.c b/block/bio.c >>>> index a1c4d2900c7a..bcf5eca0e8e3 100644 >>>> --- a/block/bio.c >>>> +++ b/block/bio.c >>>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) >>>> **/ >>>> void bio_endio(struct bio *bio) >>>> { >>>> + /* BIO_END_BY_POLL has to be set before calling submit_bio */ >>>> + if (bio_flagged(bio, BIO_END_BY_POLL)) { >>>> + bio_set_flag(bio, BIO_DONE); >>>> + return; >>>> + } >>>> again: >>>> if (!bio_remaining_done(bio)) >>>> return; >>>> diff --git a/block/blk-core.c b/block/blk-core.c >>>> index a082bbc856fb..970b23fa2e6e 100644 >>>> --- a/block/blk-core.c >>>> +++ b/block/blk-core.c >>>> @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, >>>> bio->bi_opf |= REQ_TAG; >>>> } >>>> >>>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) >>>> +{ >>>> + struct blk_bio_poll_data data = { >>>> + .bio = bio, >>>> + }; >>>> + struct blk_bio_poll_ctx *pc = ioc->data; >>>> + unsigned int queued; >>>> + >>>> + /* lock is required if there is more than one writer */ >>>> + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { >>>> + spin_lock(&pc->lock); >>>> + queued = kfifo_put(&pc->sq, data); >>>> + spin_unlock(&pc->lock); >>>> + } else { >>>> + queued = kfifo_put(&pc->sq, data); >>>> + } >>>> + >>>> + /* >>>> + * Now the bio is added per-task fifo, mark it as END_BY_POLL, >>>> + * so we can save cookie into this bio after submit_bio(). >>>> + */ >>>> + if (queued) >>>> + bio_set_flag(bio, BIO_END_BY_POLL); >>>> + else >>>> + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); >>>> + >>>> + return queued; >>>> +} >>> >>> The size of kfifo is limited, and it seems that once the sq of kfifio is >>> full, REQ_HIPRI flag is cleared and the corresponding bio is actually >>> enqueued into the default hw queue, which is IRQ driven. >> >> Yeah, this patch starts with 64 queue depth, and we can increase it to >> 128, which should cover most of cases. >> >>> >>> >>>> + >>>> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie) >>>> +{ >>>> + bio->bi_iter.bi_private_data = cookie; >>>> +} >>>> + >>>> static noinline_for_stack bool submit_bio_checks(struct bio *bio) >>>> { >>>> struct block_device *bdev = bio->bi_bdev; >>>> @@ -1008,7 +1042,7 @@ static blk_qc_t __submit_bio(struct bio *bio) >>>> * bio_list_on_stack[1] contains bios that were submitted before the current >>>> * ->submit_bio_bio, but that haven't been processed yet. >>>> */ >>>> -static blk_qc_t __submit_bio_noacct(struct bio *bio) >>>> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc) >>>> { >>>> struct bio_list bio_list_on_stack[2]; >>>> blk_qc_t ret = BLK_QC_T_NONE; >>>> @@ -1031,7 +1065,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) >>>> bio_list_on_stack[1] = bio_list_on_stack[0]; >>>> bio_list_init(&bio_list_on_stack[0]); >>>> >>>> - ret = __submit_bio(bio); >>>> + if (ioc && queue_is_mq(q) && >>>> + (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) { >>>> + bool queued = blk_bio_poll_prep_submit(ioc, bio); >>>> + >>>> + ret = __submit_bio(bio); >>>> + if (queued) >>>> + blk_bio_poll_post_submit(bio, ret); >>>> + } else { >>>> + ret = __submit_bio(bio); >>>> + } >>>> >>>> /* >>>> * Sort new bios into those for a lower level and those for the >>>> @@ -1057,6 +1100,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) >>>> return ret; >>>> } >>>> >>>> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, >>>> + struct io_context *ioc) >>>> +{ >>>> + struct blk_bio_poll_ctx *pc = ioc->data; >>>> + int entries = kfifo_len(&pc->sq); >>>> + >>>> + __submit_bio_noacct_int(bio, ioc); >>>> + >>>> + /* bio submissions queued to per-task poll context */ >>>> + if (kfifo_len(&pc->sq) > entries) >>>> + return current->pid; >>>> + >>>> + /* swapper's pid is 0, but it can't submit poll IO for us */ >>>> + return 0; >>>> +} >>>> + >>>> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) >>>> +{ >>>> + struct io_context *ioc = current->io_context; >>>> + >>>> + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) >>>> + return __submit_bio_noacct_poll(bio, ioc); >>>> + >>>> + return __submit_bio_noacct_int(bio, NULL); >>>> +} >>>> + >>>> + >>>> static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) >>>> { >>>> struct bio_list bio_list[2] = { }; >>>> diff --git a/block/blk-mq.c b/block/blk-mq.c >>>> index 03f59915fe2c..4e6f1467d303 100644 >>>> --- a/block/blk-mq.c >>>> +++ b/block/blk-mq.c >>>> @@ -3865,14 +3865,168 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, >>>> return ret; >>>> } >>>> >>>> +static blk_qc_t bio_get_poll_cookie(struct bio *bio) >>>> +{ >>>> + return bio->bi_iter.bi_private_data; >>>> +} >>>> + >>>> +static int blk_mq_poll_io(struct bio *bio) >>>> +{ >>>> + struct request_queue *q = bio->bi_bdev->bd_disk->queue; >>>> + blk_qc_t cookie = bio_get_poll_cookie(bio); >>>> + int ret = 0; >>>> + >>>> + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { >>>> + struct blk_mq_hw_ctx *hctx = >>>> + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; >>>> + >>>> + ret += blk_mq_poll_hctx(q, hctx); >>>> + } >>>> + return ret; >>>> +} >>>> + >>>> +static int blk_bio_poll_and_end_io(struct request_queue *q, >>>> + struct blk_bio_poll_ctx *poll_ctx) >>>> +{ >>>> + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; >>>> + int ret = 0; >>>> + int i; >>>> + >>>> + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { >>>> + struct bio *bio = poll_data[i].bio; >>>> + >>>> + if (!bio) >>>> + continue; >>>> + >>>> + ret += blk_mq_poll_io(bio); >>>> + if (bio_flagged(bio, BIO_DONE)) { >>>> + poll_data[i].bio = NULL; >>>> + >>>> + /* clear BIO_END_BY_POLL and end me really */ >>>> + bio_clear_flag(bio, BIO_END_BY_POLL); >>>> + bio_endio(bio); >>>> + } >>>> + } >>>> + return ret; >>>> +} >>> >>> When there are multiple threads polling, saying thread A and thread B, >>> then there's one bio which should be polled by thread A (the pid is >>> passed to thread A), while it's actually completed by thread B. In this >>> case, when the bio is completed by thread B, the bio is not really >>> completed and one extra blk_poll() still needs to be called. >> >> When this happens, the dm bio can't be completed, and the associated >> kiocb can't be completed too, io_uring or other poll code context will >> keep calling blk_poll() by passing thread A's pid until this dm bio is >> done, since the dm bio is submitted from thread A. >> > > This will affect the multi-thread polling performance. I tested > dm-stripe, in which every bio will be split and enqueued into all > underlying devices, and thus amplify the interference between multiple > threads. > > Test Result: > IOPS: 332k (IRQ) -> 363k (iopoll), aka ~10% performance gain Sorry this performance drop is not related to this bio refcount issue here. Still it's due to the limited kfifo size. I did another through test on another machine (aarch64 with more nvme disks). - Without mentioned specifically, the configuration is 'iodepth=128, kfifo queue depth =128'. - The number before '->' indicates the IOPS in IRQ mode, i.e., 'hipri=0', while the number after '->' indicates the IOPS in polling mode, i.e., 'hipri=1'. ``` 3-threads dm-linear-3 targets (4k randread IOPS, unit K) 5.12-rc1: 667 leiming: 674 -> 849 ours 8353c1a: 623 -> 811 3-threads dm-stripe-3 targets (12k randread IOPS, unit K) 5.12-rc1: 321 leiming: 313 -> 349 leiming : 313 -> 409 (iodepth=32, kfifo queue depth =128) leiming : 314 -> 409 (iodepth=128, kfifo queue depth =512) ours 8353c1a: 310 -> 406 1-thread dm-linear-3 targets (4k randread IOPS, unit K) 5.12-rc1: 224 leiming: 218 -> 288 ours 8353c1a: 210 -> 280 1-threads dm-stripe-3 targets (12k randread IOPS, unit K) 5.12-rc1: 109 leiming: 107 -> 120 leiming : 107 -> 145 (iodepth=32, kfifo queue depth =128) leiming : 108 -> 145 (iodepth=128, kfifo queue depth =512) ours 8353c1a: 107 -> 146 ``` Some hints: 1. When configured as 'iodepth=128, kfifo queue depth =128', dm-stripe doesn't perform well in polling mode. It's because it's more likely that the original bio will be split into split bios in dm-stripe, and thus kfifo will be more likely used up in this case. So the size of kfifo need to be tuned according to iodepth and the IO load. Thus exporting the size of kfifo as a sysfs entry may be need in the following patch. 2. It indicates a performance drop of my patch in IRQ mode, compared to the original 5.12-rc1. I doubt maybe it's due to extra code mixed in blk-core, such as __submit_bio_noacct()... > > > Test Environment: > > nvme.poll_queues = 3 > > BLK_BIO_POLL_SQ_SZ = 128 > > dmsetup create testdev --table "0 629145600 striped 3 8 /dev/nvme0n1 0 > /dev/nvme1n1 0 /dev/nvme4n1 0" > > > ``` > $cat fio.conf > [global] > name=iouring-sqpoll-iopoll-1 > ioengine=io_uring > iodepth=128 > numjobs=1 > thread > rw=randread > direct=1 > hipri=1 > runtime=10 > time_based > group_reporting > randrepeat=0 > filename=/dev/mapper/testdev > bs=12k > > [job-1] > cpus_allowed=14 > > [job-2] > cpus_allowed=16 > > [job-3] > cpus_allowed=84 > ``` >
On 3/17/21 10:54 AM, Ming Lei wrote: > On Tue, Mar 16, 2021 at 04:52:36PM +0800, JeffleXu wrote: >> >> >> On 3/16/21 3:17 PM, Ming Lei wrote: >>> On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: >>>> It is a giant progress to gather all split bios that need to be polled >>>> in a per-task queue. Still some comments below. >>>> >>>> >>>> On 3/16/21 11:15 AM, Ming Lei wrote: >>>>> Currently bio based IO poll needs to poll all hw queue blindly, this way >>>>> is very inefficient, and the big reason is that we can't pass bio >>>>> submission result to io poll task. >>>>> >>>>> In IO submission context, store associated underlying bios into the >>>>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, >>>>> and return current->pid to caller of submit_bio() for any DM or bio based >>>>> driver's IO, which is submitted from FS. >>>>> >>>>> In IO poll context, the passed cookie tells us the PID of submission >>>>> context, and we can find the bio from that submission context. Moving >>>>> bio from submission queue to poll queue of the poll context, and keep >>>>> polling until these bios are ended. Remove bio from poll queue if the >>>>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. >>>>> >>>>> Usually submission shares context with io poll. The per-task poll context >>>>> is just like stack variable, and it is cheap to move data between the two >>>>> per-task queues. >>>>> >>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com> >>>>> --- >>>>> block/bio.c | 5 ++ >>>>> block/blk-core.c | 74 +++++++++++++++++- >>>>> block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- >>>>> include/linux/blk_types.h | 3 + >>>>> 4 files changed, 235 insertions(+), 3 deletions(-) >>>>> >>>>> diff --git a/block/bio.c b/block/bio.c >>>>> index a1c4d2900c7a..bcf5eca0e8e3 100644 >>>>> --- a/block/bio.c >>>>> +++ b/block/bio.c >>>>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) >>>>> **/ >>>>> void bio_endio(struct bio *bio) >>>>> { >>>>> + /* BIO_END_BY_POLL has to be set before calling submit_bio */ >>>>> + if (bio_flagged(bio, BIO_END_BY_POLL)) { >>>>> + bio_set_flag(bio, BIO_DONE); >>>>> + return; >>>>> + } >>>>> again: >>>>> if (!bio_remaining_done(bio)) >>>>> return; >>>>> diff --git a/block/blk-core.c b/block/blk-core.c >>>>> index a082bbc856fb..970b23fa2e6e 100644 >>>>> --- a/block/blk-core.c >>>>> +++ b/block/blk-core.c >>>>> @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, >>>>> bio->bi_opf |= REQ_TAG; >>>>> } >>>>> >>>>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) >>>>> +{ >>>>> + struct blk_bio_poll_data data = { >>>>> + .bio = bio, >>>>> + }; >>>>> + struct blk_bio_poll_ctx *pc = ioc->data; >>>>> + unsigned int queued; >>>>> + >>>>> + /* lock is required if there is more than one writer */ >>>>> + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { >>>>> + spin_lock(&pc->lock); >>>>> + queued = kfifo_put(&pc->sq, data); >>>>> + spin_unlock(&pc->lock); >>>>> + } else { >>>>> + queued = kfifo_put(&pc->sq, data); >>>>> + } >>>>> + >>>>> + /* >>>>> + * Now the bio is added per-task fifo, mark it as END_BY_POLL, >>>>> + * so we can save cookie into this bio after submit_bio(). >>>>> + */ >>>>> + if (queued) >>>>> + bio_set_flag(bio, BIO_END_BY_POLL); >>>>> + else >>>>> + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); >>>>> + >>>>> + return queued; >>>>> +} >>>> >>>> The size of kfifo is limited, and it seems that once the sq of kfifio is >>>> full, REQ_HIPRI flag is cleared and the corresponding bio is actually >>>> enqueued into the default hw queue, which is IRQ driven. >>> >>> Yeah, this patch starts with 64 queue depth, and we can increase it to >>> 128, which should cover most of cases. >> >> It seems that the queue depth of kfifo will affect the performance as I >> did a fast test. >> >> >> >> Test Result: >> >> BLK_BIO_POLL_SQ_SZ | iodepth | IOPS >> ------------------ | ------- | ---- >> 64 | 128 | 301k (IRQ) -> 340k (iopoll) >> 64 | 16 | 304k (IRQ) -> 392k (iopoll) >> 128 | 128 | 204k (IRQ) -> 317k (iopoll) >> 256 | 128 | 241k (IRQ) -> 391k (iopoll) >> >> It seems that BLK_BIO_POLL_SQ_SZ need to be increased accordingly when >> iodepth is quite large. But I don't know why the performance in IRQ mode >> decreases when BLK_BIO_POLL_SQ_SZ is increased. > > This patchset is supposed to not affect IRQ mode because HIPRI isn't set > at IRQ mode. Or you mean '--hipri' & io_uring is setup but setting > nvme.poll_queues as 0 at your 'IRQ' mode test? > > Thanks for starting to run performance test, and so far I just run test > in KVM, not start performance test yet. > 'IRQ' means 'hipri=0' of fio configuration. The above performance test was performed on one x86 machine with one single nvme disk. I did the test on another aarch64 machine with more nvme disks, showing that this performance drop didn't occure... Please see my reply in another thread for detailed test results.
On Wed, Mar 17, 2021 at 11:53:12AM +0800, JeffleXu wrote: > > > On 3/17/21 10:54 AM, Ming Lei wrote: > > On Tue, Mar 16, 2021 at 04:52:36PM +0800, JeffleXu wrote: > >> > >> > >> On 3/16/21 3:17 PM, Ming Lei wrote: > >>> On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: > >>>> It is a giant progress to gather all split bios that need to be polled > >>>> in a per-task queue. Still some comments below. > >>>> > >>>> > >>>> On 3/16/21 11:15 AM, Ming Lei wrote: > >>>>> Currently bio based IO poll needs to poll all hw queue blindly, this way > >>>>> is very inefficient, and the big reason is that we can't pass bio > >>>>> submission result to io poll task. > >>>>> > >>>>> In IO submission context, store associated underlying bios into the > >>>>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, > >>>>> and return current->pid to caller of submit_bio() for any DM or bio based > >>>>> driver's IO, which is submitted from FS. > >>>>> > >>>>> In IO poll context, the passed cookie tells us the PID of submission > >>>>> context, and we can find the bio from that submission context. Moving > >>>>> bio from submission queue to poll queue of the poll context, and keep > >>>>> polling until these bios are ended. Remove bio from poll queue if the > >>>>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. > >>>>> > >>>>> Usually submission shares context with io poll. The per-task poll context > >>>>> is just like stack variable, and it is cheap to move data between the two > >>>>> per-task queues. > >>>>> > >>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com> > >>>>> --- > >>>>> block/bio.c | 5 ++ > >>>>> block/blk-core.c | 74 +++++++++++++++++- > >>>>> block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- > >>>>> include/linux/blk_types.h | 3 + > >>>>> 4 files changed, 235 insertions(+), 3 deletions(-) > >>>>> > >>>>> diff --git a/block/bio.c b/block/bio.c > >>>>> index a1c4d2900c7a..bcf5eca0e8e3 100644 > >>>>> --- a/block/bio.c > >>>>> +++ b/block/bio.c > >>>>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) > >>>>> **/ > >>>>> void bio_endio(struct bio *bio) > >>>>> { > >>>>> + /* BIO_END_BY_POLL has to be set before calling submit_bio */ > >>>>> + if (bio_flagged(bio, BIO_END_BY_POLL)) { > >>>>> + bio_set_flag(bio, BIO_DONE); > >>>>> + return; > >>>>> + } > >>>>> again: > >>>>> if (!bio_remaining_done(bio)) > >>>>> return; > >>>>> diff --git a/block/blk-core.c b/block/blk-core.c > >>>>> index a082bbc856fb..970b23fa2e6e 100644 > >>>>> --- a/block/blk-core.c > >>>>> +++ b/block/blk-core.c > >>>>> @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, > >>>>> bio->bi_opf |= REQ_TAG; > >>>>> } > >>>>> > >>>>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) > >>>>> +{ > >>>>> + struct blk_bio_poll_data data = { > >>>>> + .bio = bio, > >>>>> + }; > >>>>> + struct blk_bio_poll_ctx *pc = ioc->data; > >>>>> + unsigned int queued; > >>>>> + > >>>>> + /* lock is required if there is more than one writer */ > >>>>> + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { > >>>>> + spin_lock(&pc->lock); > >>>>> + queued = kfifo_put(&pc->sq, data); > >>>>> + spin_unlock(&pc->lock); > >>>>> + } else { > >>>>> + queued = kfifo_put(&pc->sq, data); > >>>>> + } > >>>>> + > >>>>> + /* > >>>>> + * Now the bio is added per-task fifo, mark it as END_BY_POLL, > >>>>> + * so we can save cookie into this bio after submit_bio(). > >>>>> + */ > >>>>> + if (queued) > >>>>> + bio_set_flag(bio, BIO_END_BY_POLL); > >>>>> + else > >>>>> + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); > >>>>> + > >>>>> + return queued; > >>>>> +} > >>>> > >>>> The size of kfifo is limited, and it seems that once the sq of kfifio is > >>>> full, REQ_HIPRI flag is cleared and the corresponding bio is actually > >>>> enqueued into the default hw queue, which is IRQ driven. > >>> > >>> Yeah, this patch starts with 64 queue depth, and we can increase it to > >>> 128, which should cover most of cases. > >> > >> It seems that the queue depth of kfifo will affect the performance as I > >> did a fast test. > >> > >> > >> > >> Test Result: > >> > >> BLK_BIO_POLL_SQ_SZ | iodepth | IOPS > >> ------------------ | ------- | ---- > >> 64 | 128 | 301k (IRQ) -> 340k (iopoll) > >> 64 | 16 | 304k (IRQ) -> 392k (iopoll) > >> 128 | 128 | 204k (IRQ) -> 317k (iopoll) > >> 256 | 128 | 241k (IRQ) -> 391k (iopoll) > >> > >> It seems that BLK_BIO_POLL_SQ_SZ need to be increased accordingly when > >> iodepth is quite large. But I don't know why the performance in IRQ mode > >> decreases when BLK_BIO_POLL_SQ_SZ is increased. > > > > This patchset is supposed to not affect IRQ mode because HIPRI isn't set > > at IRQ mode. Or you mean '--hipri' & io_uring is setup but setting > > nvme.poll_queues as 0 at your 'IRQ' mode test? > > > > Thanks for starting to run performance test, and so far I just run test > > in KVM, not start performance test yet. > > > > 'IRQ' means 'hipri=0' of fio configuration. 'hipri=0' isn't supposed to be affected by this patchset. thanks, Ming -- dm-devel mailing list dm-devel@redhat.com https://listman.redhat.com/mailman/listinfo/dm-devel
On Wed, Mar 17, 2021 at 11:49:00AM +0800, JeffleXu wrote: > > > On 3/16/21 7:00 PM, JeffleXu wrote: > > > > > > On 3/16/21 3:17 PM, Ming Lei wrote: > >> On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: > >>> It is a giant progress to gather all split bios that need to be polled > >>> in a per-task queue. Still some comments below. > >>> > >>> > >>> On 3/16/21 11:15 AM, Ming Lei wrote: > >>>> Currently bio based IO poll needs to poll all hw queue blindly, this way > >>>> is very inefficient, and the big reason is that we can't pass bio > >>>> submission result to io poll task. > >>>> > >>>> In IO submission context, store associated underlying bios into the > >>>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, > >>>> and return current->pid to caller of submit_bio() for any DM or bio based > >>>> driver's IO, which is submitted from FS. > >>>> > >>>> In IO poll context, the passed cookie tells us the PID of submission > >>>> context, and we can find the bio from that submission context. Moving > >>>> bio from submission queue to poll queue of the poll context, and keep > >>>> polling until these bios are ended. Remove bio from poll queue if the > >>>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. > >>>> > >>>> Usually submission shares context with io poll. The per-task poll context > >>>> is just like stack variable, and it is cheap to move data between the two > >>>> per-task queues. > >>>> > >>>> Signed-off-by: Ming Lei <ming.lei@redhat.com> > >>>> --- > >>>> block/bio.c | 5 ++ > >>>> block/blk-core.c | 74 +++++++++++++++++- > >>>> block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- > >>>> include/linux/blk_types.h | 3 + > >>>> 4 files changed, 235 insertions(+), 3 deletions(-) > >>>> > >>>> diff --git a/block/bio.c b/block/bio.c > >>>> index a1c4d2900c7a..bcf5eca0e8e3 100644 > >>>> --- a/block/bio.c > >>>> +++ b/block/bio.c > >>>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) > >>>> **/ > >>>> void bio_endio(struct bio *bio) > >>>> { > >>>> + /* BIO_END_BY_POLL has to be set before calling submit_bio */ > >>>> + if (bio_flagged(bio, BIO_END_BY_POLL)) { > >>>> + bio_set_flag(bio, BIO_DONE); > >>>> + return; > >>>> + } > >>>> again: > >>>> if (!bio_remaining_done(bio)) > >>>> return; > >>>> diff --git a/block/blk-core.c b/block/blk-core.c > >>>> index a082bbc856fb..970b23fa2e6e 100644 > >>>> --- a/block/blk-core.c > >>>> +++ b/block/blk-core.c > >>>> @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, > >>>> bio->bi_opf |= REQ_TAG; > >>>> } > >>>> > >>>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) > >>>> +{ > >>>> + struct blk_bio_poll_data data = { > >>>> + .bio = bio, > >>>> + }; > >>>> + struct blk_bio_poll_ctx *pc = ioc->data; > >>>> + unsigned int queued; > >>>> + > >>>> + /* lock is required if there is more than one writer */ > >>>> + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { > >>>> + spin_lock(&pc->lock); > >>>> + queued = kfifo_put(&pc->sq, data); > >>>> + spin_unlock(&pc->lock); > >>>> + } else { > >>>> + queued = kfifo_put(&pc->sq, data); > >>>> + } > >>>> + > >>>> + /* > >>>> + * Now the bio is added per-task fifo, mark it as END_BY_POLL, > >>>> + * so we can save cookie into this bio after submit_bio(). > >>>> + */ > >>>> + if (queued) > >>>> + bio_set_flag(bio, BIO_END_BY_POLL); > >>>> + else > >>>> + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); > >>>> + > >>>> + return queued; > >>>> +} > >>> > >>> The size of kfifo is limited, and it seems that once the sq of kfifio is > >>> full, REQ_HIPRI flag is cleared and the corresponding bio is actually > >>> enqueued into the default hw queue, which is IRQ driven. > >> > >> Yeah, this patch starts with 64 queue depth, and we can increase it to > >> 128, which should cover most of cases. > >> > >>> > >>> > >>>> + > >>>> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie) > >>>> +{ > >>>> + bio->bi_iter.bi_private_data = cookie; > >>>> +} > >>>> + > >>>> static noinline_for_stack bool submit_bio_checks(struct bio *bio) > >>>> { > >>>> struct block_device *bdev = bio->bi_bdev; > >>>> @@ -1008,7 +1042,7 @@ static blk_qc_t __submit_bio(struct bio *bio) > >>>> * bio_list_on_stack[1] contains bios that were submitted before the current > >>>> * ->submit_bio_bio, but that haven't been processed yet. > >>>> */ > >>>> -static blk_qc_t __submit_bio_noacct(struct bio *bio) > >>>> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc) > >>>> { > >>>> struct bio_list bio_list_on_stack[2]; > >>>> blk_qc_t ret = BLK_QC_T_NONE; > >>>> @@ -1031,7 +1065,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > >>>> bio_list_on_stack[1] = bio_list_on_stack[0]; > >>>> bio_list_init(&bio_list_on_stack[0]); > >>>> > >>>> - ret = __submit_bio(bio); > >>>> + if (ioc && queue_is_mq(q) && > >>>> + (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) { > >>>> + bool queued = blk_bio_poll_prep_submit(ioc, bio); > >>>> + > >>>> + ret = __submit_bio(bio); > >>>> + if (queued) > >>>> + blk_bio_poll_post_submit(bio, ret); > >>>> + } else { > >>>> + ret = __submit_bio(bio); > >>>> + } > >>>> > >>>> /* > >>>> * Sort new bios into those for a lower level and those for the > >>>> @@ -1057,6 +1100,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > >>>> return ret; > >>>> } > >>>> > >>>> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, > >>>> + struct io_context *ioc) > >>>> +{ > >>>> + struct blk_bio_poll_ctx *pc = ioc->data; > >>>> + int entries = kfifo_len(&pc->sq); > >>>> + > >>>> + __submit_bio_noacct_int(bio, ioc); > >>>> + > >>>> + /* bio submissions queued to per-task poll context */ > >>>> + if (kfifo_len(&pc->sq) > entries) > >>>> + return current->pid; > >>>> + > >>>> + /* swapper's pid is 0, but it can't submit poll IO for us */ > >>>> + return 0; > >>>> +} > >>>> + > >>>> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) > >>>> +{ > >>>> + struct io_context *ioc = current->io_context; > >>>> + > >>>> + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) > >>>> + return __submit_bio_noacct_poll(bio, ioc); > >>>> + > >>>> + return __submit_bio_noacct_int(bio, NULL); > >>>> +} > >>>> + > >>>> + > >>>> static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) > >>>> { > >>>> struct bio_list bio_list[2] = { }; > >>>> diff --git a/block/blk-mq.c b/block/blk-mq.c > >>>> index 03f59915fe2c..4e6f1467d303 100644 > >>>> --- a/block/blk-mq.c > >>>> +++ b/block/blk-mq.c > >>>> @@ -3865,14 +3865,168 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, > >>>> return ret; > >>>> } > >>>> > >>>> +static blk_qc_t bio_get_poll_cookie(struct bio *bio) > >>>> +{ > >>>> + return bio->bi_iter.bi_private_data; > >>>> +} > >>>> + > >>>> +static int blk_mq_poll_io(struct bio *bio) > >>>> +{ > >>>> + struct request_queue *q = bio->bi_bdev->bd_disk->queue; > >>>> + blk_qc_t cookie = bio_get_poll_cookie(bio); > >>>> + int ret = 0; > >>>> + > >>>> + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { > >>>> + struct blk_mq_hw_ctx *hctx = > >>>> + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; > >>>> + > >>>> + ret += blk_mq_poll_hctx(q, hctx); > >>>> + } > >>>> + return ret; > >>>> +} > >>>> + > >>>> +static int blk_bio_poll_and_end_io(struct request_queue *q, > >>>> + struct blk_bio_poll_ctx *poll_ctx) > >>>> +{ > >>>> + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; > >>>> + int ret = 0; > >>>> + int i; > >>>> + > >>>> + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { > >>>> + struct bio *bio = poll_data[i].bio; > >>>> + > >>>> + if (!bio) > >>>> + continue; > >>>> + > >>>> + ret += blk_mq_poll_io(bio); > >>>> + if (bio_flagged(bio, BIO_DONE)) { > >>>> + poll_data[i].bio = NULL; > >>>> + > >>>> + /* clear BIO_END_BY_POLL and end me really */ > >>>> + bio_clear_flag(bio, BIO_END_BY_POLL); > >>>> + bio_endio(bio); > >>>> + } > >>>> + } > >>>> + return ret; > >>>> +} > >>> > >>> When there are multiple threads polling, saying thread A and thread B, > >>> then there's one bio which should be polled by thread A (the pid is > >>> passed to thread A), while it's actually completed by thread B. In this > >>> case, when the bio is completed by thread B, the bio is not really > >>> completed and one extra blk_poll() still needs to be called. > >> > >> When this happens, the dm bio can't be completed, and the associated > >> kiocb can't be completed too, io_uring or other poll code context will > >> keep calling blk_poll() by passing thread A's pid until this dm bio is > >> done, since the dm bio is submitted from thread A. > >> > > > > This will affect the multi-thread polling performance. I tested > > dm-stripe, in which every bio will be split and enqueued into all > > underlying devices, and thus amplify the interference between multiple > > threads. > > > > Test Result: > > IOPS: 332k (IRQ) -> 363k (iopoll), aka ~10% performance gain > > Sorry this performance drop is not related to this bio refcount issue > here. Still it's due to the limited kfifo size. > > > I did another through test on another machine (aarch64 with more nvme > disks). > > - Without mentioned specifically, the configuration is 'iodepth=128, > kfifo queue depth =128'. > - The number before '->' indicates the IOPS in IRQ mode, i.e., > 'hipri=0', while the number after '->' indicates the IOPS in polling > mode, i.e., 'hipri=1'. > > ``` > 3-threads dm-linear-3 targets (4k randread IOPS, unit K) > 5.12-rc1: 667 > leiming: 674 -> 849 > ours 8353c1a: 623 -> 811 > > 3-threads dm-stripe-3 targets (12k randread IOPS, unit K) > 5.12-rc1: 321 > leiming: 313 -> 349 > leiming : 313 -> 409 (iodepth=32, kfifo queue depth =128) > leiming : 314 -> 409 (iodepth=128, kfifo queue depth =512) > ours 8353c1a: 310 -> 406 > > > 1-thread dm-linear-3 targets (4k randread IOPS, unit K) > 5.12-rc1: 224 > leiming: 218 -> 288 > ours 8353c1a: 210 -> 280 > > 1-threads dm-stripe-3 targets (12k randread IOPS, unit K) > 5.12-rc1: 109 > leiming: 107 -> 120 > leiming : 107 -> 145 (iodepth=32, kfifo queue depth =128) > leiming : 108 -> 145 (iodepth=128, kfifo queue depth =512) > ours 8353c1a: 107 -> 146 > ``` > > > Some hints: > > 1. When configured as 'iodepth=128, kfifo queue depth =128', dm-stripe > doesn't perform well in polling mode. It's because it's more likely that > the original bio will be split into split bios in dm-stripe, and thus > kfifo will be more likely used up in this case. So the size of kfifo > need to be tuned according to iodepth and the IO load. Thus exporting > the size of kfifo as a sysfs entry may be need in the following patch. Yeah, I think your analysis is right. On simple approach to address the scalability issue is to put submitted bio into a per-task list, however one new field(8bytes) needs to be added to bio, or something like below: 1) disable hipri bio merge, then we can reuse bio->bi_next or 2) track request instead of bio, then it should be easier to get one field from 'struct request' for such purpose, such as 'ipi_list'. Seems 2) is possible, will try it and see if the approach is really doable. thanks, Ming -- dm-devel mailing list dm-devel@redhat.com https://listman.redhat.com/mailman/listinfo/dm-devel
On Wed, Mar 17 2021 at 3:19am -0400, Ming Lei <ming.lei@redhat.com> wrote: > On Wed, Mar 17, 2021 at 11:49:00AM +0800, JeffleXu wrote: > > > > > > On 3/16/21 7:00 PM, JeffleXu wrote: > > > > > > > > > On 3/16/21 3:17 PM, Ming Lei wrote: > > >> On Tue, Mar 16, 2021 at 02:46:08PM +0800, JeffleXu wrote: > > >>> It is a giant progress to gather all split bios that need to be polled > > >>> in a per-task queue. Still some comments below. > > >>> > > >>> > > >>> On 3/16/21 11:15 AM, Ming Lei wrote: > > >>>> Currently bio based IO poll needs to poll all hw queue blindly, this way > > >>>> is very inefficient, and the big reason is that we can't pass bio > > >>>> submission result to io poll task. > > >>>> > > >>>> In IO submission context, store associated underlying bios into the > > >>>> submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, > > >>>> and return current->pid to caller of submit_bio() for any DM or bio based > > >>>> driver's IO, which is submitted from FS. > > >>>> > > >>>> In IO poll context, the passed cookie tells us the PID of submission > > >>>> context, and we can find the bio from that submission context. Moving > > >>>> bio from submission queue to poll queue of the poll context, and keep > > >>>> polling until these bios are ended. Remove bio from poll queue if the > > >>>> bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. > > >>>> > > >>>> Usually submission shares context with io poll. The per-task poll context > > >>>> is just like stack variable, and it is cheap to move data between the two > > >>>> per-task queues. > > >>>> > > >>>> Signed-off-by: Ming Lei <ming.lei@redhat.com> > > >>>> --- > > >>>> block/bio.c | 5 ++ > > >>>> block/blk-core.c | 74 +++++++++++++++++- > > >>>> block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- > > >>>> include/linux/blk_types.h | 3 + > > >>>> 4 files changed, 235 insertions(+), 3 deletions(-) > > >>>> > > >>>> diff --git a/block/bio.c b/block/bio.c > > >>>> index a1c4d2900c7a..bcf5eca0e8e3 100644 > > >>>> --- a/block/bio.c > > >>>> +++ b/block/bio.c > > >>>> @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) > > >>>> **/ > > >>>> void bio_endio(struct bio *bio) > > >>>> { > > >>>> + /* BIO_END_BY_POLL has to be set before calling submit_bio */ > > >>>> + if (bio_flagged(bio, BIO_END_BY_POLL)) { > > >>>> + bio_set_flag(bio, BIO_DONE); > > >>>> + return; > > >>>> + } > > >>>> again: > > >>>> if (!bio_remaining_done(bio)) > > >>>> return; > > >>>> diff --git a/block/blk-core.c b/block/blk-core.c > > >>>> index a082bbc856fb..970b23fa2e6e 100644 > > >>>> --- a/block/blk-core.c > > >>>> +++ b/block/blk-core.c > > >>>> @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, > > >>>> bio->bi_opf |= REQ_TAG; > > >>>> } > > >>>> > > >>>> +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) > > >>>> +{ > > >>>> + struct blk_bio_poll_data data = { > > >>>> + .bio = bio, > > >>>> + }; > > >>>> + struct blk_bio_poll_ctx *pc = ioc->data; > > >>>> + unsigned int queued; > > >>>> + > > >>>> + /* lock is required if there is more than one writer */ > > >>>> + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { > > >>>> + spin_lock(&pc->lock); > > >>>> + queued = kfifo_put(&pc->sq, data); > > >>>> + spin_unlock(&pc->lock); > > >>>> + } else { > > >>>> + queued = kfifo_put(&pc->sq, data); > > >>>> + } > > >>>> + > > >>>> + /* > > >>>> + * Now the bio is added per-task fifo, mark it as END_BY_POLL, > > >>>> + * so we can save cookie into this bio after submit_bio(). > > >>>> + */ > > >>>> + if (queued) > > >>>> + bio_set_flag(bio, BIO_END_BY_POLL); > > >>>> + else > > >>>> + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); > > >>>> + > > >>>> + return queued; > > >>>> +} > > >>> > > >>> The size of kfifo is limited, and it seems that once the sq of kfifio is > > >>> full, REQ_HIPRI flag is cleared and the corresponding bio is actually > > >>> enqueued into the default hw queue, which is IRQ driven. > > >> > > >> Yeah, this patch starts with 64 queue depth, and we can increase it to > > >> 128, which should cover most of cases. > > >> > > >>> > > >>> > > >>>> + > > >>>> +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie) > > >>>> +{ > > >>>> + bio->bi_iter.bi_private_data = cookie; > > >>>> +} > > >>>> + > > >>>> static noinline_for_stack bool submit_bio_checks(struct bio *bio) > > >>>> { > > >>>> struct block_device *bdev = bio->bi_bdev; > > >>>> @@ -1008,7 +1042,7 @@ static blk_qc_t __submit_bio(struct bio *bio) > > >>>> * bio_list_on_stack[1] contains bios that were submitted before the current > > >>>> * ->submit_bio_bio, but that haven't been processed yet. > > >>>> */ > > >>>> -static blk_qc_t __submit_bio_noacct(struct bio *bio) > > >>>> +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc) > > >>>> { > > >>>> struct bio_list bio_list_on_stack[2]; > > >>>> blk_qc_t ret = BLK_QC_T_NONE; > > >>>> @@ -1031,7 +1065,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > > >>>> bio_list_on_stack[1] = bio_list_on_stack[0]; > > >>>> bio_list_init(&bio_list_on_stack[0]); > > >>>> > > >>>> - ret = __submit_bio(bio); > > >>>> + if (ioc && queue_is_mq(q) && > > >>>> + (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) { > > >>>> + bool queued = blk_bio_poll_prep_submit(ioc, bio); > > >>>> + > > >>>> + ret = __submit_bio(bio); > > >>>> + if (queued) > > >>>> + blk_bio_poll_post_submit(bio, ret); > > >>>> + } else { > > >>>> + ret = __submit_bio(bio); > > >>>> + } > > >>>> > > >>>> /* > > >>>> * Sort new bios into those for a lower level and those for the > > >>>> @@ -1057,6 +1100,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) > > >>>> return ret; > > >>>> } > > >>>> > > >>>> +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, > > >>>> + struct io_context *ioc) > > >>>> +{ > > >>>> + struct blk_bio_poll_ctx *pc = ioc->data; > > >>>> + int entries = kfifo_len(&pc->sq); > > >>>> + > > >>>> + __submit_bio_noacct_int(bio, ioc); > > >>>> + > > >>>> + /* bio submissions queued to per-task poll context */ > > >>>> + if (kfifo_len(&pc->sq) > entries) > > >>>> + return current->pid; > > >>>> + > > >>>> + /* swapper's pid is 0, but it can't submit poll IO for us */ > > >>>> + return 0; > > >>>> +} > > >>>> + > > >>>> +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) > > >>>> +{ > > >>>> + struct io_context *ioc = current->io_context; > > >>>> + > > >>>> + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) > > >>>> + return __submit_bio_noacct_poll(bio, ioc); > > >>>> + > > >>>> + return __submit_bio_noacct_int(bio, NULL); > > >>>> +} > > >>>> + > > >>>> + > > >>>> static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) > > >>>> { > > >>>> struct bio_list bio_list[2] = { }; > > >>>> diff --git a/block/blk-mq.c b/block/blk-mq.c > > >>>> index 03f59915fe2c..4e6f1467d303 100644 > > >>>> --- a/block/blk-mq.c > > >>>> +++ b/block/blk-mq.c > > >>>> @@ -3865,14 +3865,168 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, > > >>>> return ret; > > >>>> } > > >>>> > > >>>> +static blk_qc_t bio_get_poll_cookie(struct bio *bio) > > >>>> +{ > > >>>> + return bio->bi_iter.bi_private_data; > > >>>> +} > > >>>> + > > >>>> +static int blk_mq_poll_io(struct bio *bio) > > >>>> +{ > > >>>> + struct request_queue *q = bio->bi_bdev->bd_disk->queue; > > >>>> + blk_qc_t cookie = bio_get_poll_cookie(bio); > > >>>> + int ret = 0; > > >>>> + > > >>>> + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { > > >>>> + struct blk_mq_hw_ctx *hctx = > > >>>> + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; > > >>>> + > > >>>> + ret += blk_mq_poll_hctx(q, hctx); > > >>>> + } > > >>>> + return ret; > > >>>> +} > > >>>> + > > >>>> +static int blk_bio_poll_and_end_io(struct request_queue *q, > > >>>> + struct blk_bio_poll_ctx *poll_ctx) > > >>>> +{ > > >>>> + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; > > >>>> + int ret = 0; > > >>>> + int i; > > >>>> + > > >>>> + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { > > >>>> + struct bio *bio = poll_data[i].bio; > > >>>> + > > >>>> + if (!bio) > > >>>> + continue; > > >>>> + > > >>>> + ret += blk_mq_poll_io(bio); > > >>>> + if (bio_flagged(bio, BIO_DONE)) { > > >>>> + poll_data[i].bio = NULL; > > >>>> + > > >>>> + /* clear BIO_END_BY_POLL and end me really */ > > >>>> + bio_clear_flag(bio, BIO_END_BY_POLL); > > >>>> + bio_endio(bio); > > >>>> + } > > >>>> + } > > >>>> + return ret; > > >>>> +} > > >>> > > >>> When there are multiple threads polling, saying thread A and thread B, > > >>> then there's one bio which should be polled by thread A (the pid is > > >>> passed to thread A), while it's actually completed by thread B. In this > > >>> case, when the bio is completed by thread B, the bio is not really > > >>> completed and one extra blk_poll() still needs to be called. > > >> > > >> When this happens, the dm bio can't be completed, and the associated > > >> kiocb can't be completed too, io_uring or other poll code context will > > >> keep calling blk_poll() by passing thread A's pid until this dm bio is > > >> done, since the dm bio is submitted from thread A. > > >> > > > > > > This will affect the multi-thread polling performance. I tested > > > dm-stripe, in which every bio will be split and enqueued into all > > > underlying devices, and thus amplify the interference between multiple > > > threads. > > > > > > Test Result: > > > IOPS: 332k (IRQ) -> 363k (iopoll), aka ~10% performance gain > > > > Sorry this performance drop is not related to this bio refcount issue > > here. Still it's due to the limited kfifo size. > > > > > > I did another through test on another machine (aarch64 with more nvme > > disks). > > > > - Without mentioned specifically, the configuration is 'iodepth=128, > > kfifo queue depth =128'. > > - The number before '->' indicates the IOPS in IRQ mode, i.e., > > 'hipri=0', while the number after '->' indicates the IOPS in polling > > mode, i.e., 'hipri=1'. > > > > ``` > > 3-threads dm-linear-3 targets (4k randread IOPS, unit K) > > 5.12-rc1: 667 > > leiming: 674 -> 849 > > ours 8353c1a: 623 -> 811 > > > > 3-threads dm-stripe-3 targets (12k randread IOPS, unit K) > > 5.12-rc1: 321 > > leiming: 313 -> 349 > > leiming : 313 -> 409 (iodepth=32, kfifo queue depth =128) > > leiming : 314 -> 409 (iodepth=128, kfifo queue depth =512) > > ours 8353c1a: 310 -> 406 > > > > > > 1-thread dm-linear-3 targets (4k randread IOPS, unit K) > > 5.12-rc1: 224 > > leiming: 218 -> 288 > > ours 8353c1a: 210 -> 280 > > > > 1-threads dm-stripe-3 targets (12k randread IOPS, unit K) > > 5.12-rc1: 109 > > leiming: 107 -> 120 > > leiming : 107 -> 145 (iodepth=32, kfifo queue depth =128) > > leiming : 108 -> 145 (iodepth=128, kfifo queue depth =512) > > ours 8353c1a: 107 -> 146 > > ``` > > > > > > Some hints: > > > > 1. When configured as 'iodepth=128, kfifo queue depth =128', dm-stripe > > doesn't perform well in polling mode. It's because it's more likely that > > the original bio will be split into split bios in dm-stripe, and thus > > kfifo will be more likely used up in this case. So the size of kfifo > > need to be tuned according to iodepth and the IO load. Thus exporting > > the size of kfifo as a sysfs entry may be need in the following patch. > > Yeah, I think your analysis is right. > > On simple approach to address the scalability issue is to put submitted > bio into a per-task list, however one new field(8bytes) needs to be > added to bio, or something like below: > > 1) disable hipri bio merge, then we can reuse bio->bi_next > > or > > 2) track request instead of bio, then it should be easier to get one > field from 'struct request' for such purpose, such as 'ipi_list'. > > Seems 2) is possible, will try it and see if the approach is really doable. Not (yet) seeing how making tracking (either requests or bios) per-task will help. Though tracking in terms of requests reduces the amount of polling (due to hopeful merging, at least in sequential IO case) it doesn't _really_ make the task cookie -> polled_object mapping any more efficient for the single thread test-case Jeffle ran: the fan-out of bio-splits for _random_ IO issued to 3-way dm-stripe is inherently messy to track. Basically I'm just wondering where you see your per-task request-based tracking approach helping? Multithreaded sequential workloads? Feels like the poll cookie being a task id is just extremely coarse. Doesn't really allow polling to be done more precisely... what am I missing? Thanks, Mike -- dm-devel mailing list dm-devel@redhat.com https://listman.redhat.com/mailman/listinfo/dm-devel
diff --git a/block/bio.c b/block/bio.c index a1c4d2900c7a..bcf5eca0e8e3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) **/ void bio_endio(struct bio *bio) { + /* BIO_END_BY_POLL has to be set before calling submit_bio */ + if (bio_flagged(bio, BIO_END_BY_POLL)) { + bio_set_flag(bio, BIO_DONE); + return; + } again: if (!bio_remaining_done(bio)) return; diff --git a/block/blk-core.c b/block/blk-core.c index a082bbc856fb..970b23fa2e6e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -854,6 +854,40 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, bio->bi_opf |= REQ_TAG; } +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) +{ + struct blk_bio_poll_data data = { + .bio = bio, + }; + struct blk_bio_poll_ctx *pc = ioc->data; + unsigned int queued; + + /* lock is required if there is more than one writer */ + if (unlikely(atomic_read(&ioc->nr_tasks) > 1)) { + spin_lock(&pc->lock); + queued = kfifo_put(&pc->sq, data); + spin_unlock(&pc->lock); + } else { + queued = kfifo_put(&pc->sq, data); + } + + /* + * Now the bio is added per-task fifo, mark it as END_BY_POLL, + * so we can save cookie into this bio after submit_bio(). + */ + if (queued) + bio_set_flag(bio, BIO_END_BY_POLL); + else + bio->bi_opf &= ~(REQ_HIPRI | REQ_TAG); + + return queued; +} + +static void blk_bio_poll_post_submit(struct bio *bio, blk_qc_t cookie) +{ + bio->bi_iter.bi_private_data = cookie; +} + static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; @@ -1008,7 +1042,7 @@ static blk_qc_t __submit_bio(struct bio *bio) * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio_bio, but that haven't been processed yet. */ -static blk_qc_t __submit_bio_noacct(struct bio *bio) +static blk_qc_t __submit_bio_noacct_int(struct bio *bio, struct io_context *ioc) { struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; @@ -1031,7 +1065,16 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = __submit_bio(bio); + if (ioc && queue_is_mq(q) && + (bio->bi_opf & (REQ_HIPRI | REQ_TAG))) { + bool queued = blk_bio_poll_prep_submit(ioc, bio); + + ret = __submit_bio(bio); + if (queued) + blk_bio_poll_post_submit(bio, ret); + } else { + ret = __submit_bio(bio); + } /* * Sort new bios into those for a lower level and those for the @@ -1057,6 +1100,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) return ret; } +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, + struct io_context *ioc) +{ + struct blk_bio_poll_ctx *pc = ioc->data; + int entries = kfifo_len(&pc->sq); + + __submit_bio_noacct_int(bio, ioc); + + /* bio submissions queued to per-task poll context */ + if (kfifo_len(&pc->sq) > entries) + return current->pid; + + /* swapper's pid is 0, but it can't submit poll IO for us */ + return 0; +} + +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) +{ + struct io_context *ioc = current->io_context; + + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) + return __submit_bio_noacct_poll(bio, ioc); + + return __submit_bio_noacct_int(bio, NULL); +} + + static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; diff --git a/block/blk-mq.c b/block/blk-mq.c index 03f59915fe2c..4e6f1467d303 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3865,14 +3865,168 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, return ret; } +static blk_qc_t bio_get_poll_cookie(struct bio *bio) +{ + return bio->bi_iter.bi_private_data; +} + +static int blk_mq_poll_io(struct bio *bio) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + blk_qc_t cookie = bio_get_poll_cookie(bio); + int ret = 0; + + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { + struct blk_mq_hw_ctx *hctx = + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + + ret += blk_mq_poll_hctx(q, hctx); + } + return ret; +} + +static int blk_bio_poll_and_end_io(struct request_queue *q, + struct blk_bio_poll_ctx *poll_ctx) +{ + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; + int ret = 0; + int i; + + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { + struct bio *bio = poll_data[i].bio; + + if (!bio) + continue; + + ret += blk_mq_poll_io(bio); + if (bio_flagged(bio, BIO_DONE)) { + poll_data[i].bio = NULL; + + /* clear BIO_END_BY_POLL and end me really */ + bio_clear_flag(bio, BIO_END_BY_POLL); + bio_endio(bio); + } + } + return ret; +} + +static int __blk_bio_poll_io(struct request_queue *q, + struct blk_bio_poll_ctx *submit_ctx, + struct blk_bio_poll_ctx *poll_ctx) +{ + struct blk_bio_poll_data *poll_data = &poll_ctx->pq[0]; + int i; + + /* + * Move IO submission result from submission queue in submission + * context to poll queue of poll context. + * + * There may be more than one readers on poll queue of the same + * submission context, so have to lock here. + */ + spin_lock(&submit_ctx->lock); + for (i = 0; i < BLK_BIO_POLL_PQ_SZ; i++) { + if (poll_data[i].bio == NULL && + !kfifo_get(&submit_ctx->sq, &poll_data[i])) + break; + } + spin_unlock(&submit_ctx->lock); + + return blk_bio_poll_and_end_io(q, poll_ctx); +} + +static int blk_bio_poll_io(struct request_queue *q, + struct io_context *submit_ioc, + struct io_context *poll_ioc) +{ + struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data; + struct blk_bio_poll_ctx *poll_ctx = poll_ioc->data; + int ret; + + if (unlikely(atomic_read(&poll_ioc->nr_tasks) > 1)) { + mutex_lock(&poll_ctx->pq_lock); + ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx); + mutex_unlock(&poll_ctx->pq_lock); + } else { + ret = __blk_bio_poll_io(q, submit_ctx, poll_ctx); + } + return ret; +} + +static bool blk_bio_ioc_valid(struct task_struct *t) +{ + if (!t) + return false; + + if (!t->io_context) + return false; + + if (!t->io_context->data) + return false; + + return true; +} + +static int __blk_bio_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct io_context *poll_ioc = current->io_context; + pid_t pid; + struct task_struct *submit_task; + int ret; + + pid = (pid_t)cookie; + + /* io poll often share io submission context */ + if (likely(current->pid == pid && blk_bio_ioc_valid(current))) + return blk_bio_poll_io(q, poll_ioc, poll_ioc); + + submit_task = find_get_task_by_vpid(pid); + if (likely(blk_bio_ioc_valid(submit_task))) + ret = blk_bio_poll_io(q, submit_task->io_context, + poll_ioc); + else + ret = 0; + + put_task_struct(submit_task); + + return ret; +} + static int blk_bio_poll(struct request_queue *q, blk_qc_t cookie, bool spin) { + long state; + + /* no need to poll */ + if (cookie == 0) + return 0; + /* * Create poll queue for storing poll bio and its cookie from * submission queue */ blk_create_io_context(q, true); + state = current->state; + do { + int ret; + + ret = __blk_bio_poll(q, cookie); + if (ret > 0) { + __set_current_state(TASK_RUNNING); + return ret; + } + + if (signal_pending_state(state, current)) + __set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return 1; + if (ret < 0 || !spin) + break; + cpu_relax(); + } while (!need_resched()); + + __set_current_state(TASK_RUNNING); return 0; } @@ -3893,7 +4047,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) struct blk_mq_hw_ctx *hctx; long state; - if (!blk_qc_t_valid(cookie) || !blk_queue_poll(q)) + if (!blk_queue_poll(q) || (queue_is_mq(q) && !blk_qc_t_valid(cookie))) return 0; if (current->plug) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1bcade4bcc3..53f64eea9652 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -304,6 +304,9 @@ enum { BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ BIO_REMAPPED, + BIO_END_BY_POLL, /* end by blk_bio_poll() explicitly */ + /* set when bio can be ended, used for bio with BIO_END_BY_POLL */ + BIO_DONE, BIO_FLAG_LAST };
Currently bio based IO poll needs to poll all hw queue blindly, this way is very inefficient, and the big reason is that we can't pass bio submission result to io poll task. In IO submission context, store associated underlying bios into the submission queue and save 'cookie' poll data in bio->bi_iter.bi_private_data, and return current->pid to caller of submit_bio() for any DM or bio based driver's IO, which is submitted from FS. In IO poll context, the passed cookie tells us the PID of submission context, and we can find the bio from that submission context. Moving bio from submission queue to poll queue of the poll context, and keep polling until these bios are ended. Remove bio from poll queue if the bio is ended. Add BIO_DONE and BIO_END_BY_POLL for such purpose. Usually submission shares context with io poll. The per-task poll context is just like stack variable, and it is cheap to move data between the two per-task queues. Signed-off-by: Ming Lei <ming.lei@redhat.com> --- block/bio.c | 5 ++ block/blk-core.c | 74 +++++++++++++++++- block/blk-mq.c | 156 +++++++++++++++++++++++++++++++++++++- include/linux/blk_types.h | 3 + 4 files changed, 235 insertions(+), 3 deletions(-)