[V5,08/12] block: use per-task poll context to implement bio based io polling

Message ID	20210401021927.343727-9-ming.lei@redhat.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-block-owner@kernel.org> From: Ming Lei <ming.lei@redhat.com> To: Jens Axboe <axboe@kernel.dk> Cc: linux-block@vger.kernel.org, Jeffle Xu <jefflexu@linux.alibaba.com>, Mike Snitzer <snitzer@redhat.com>, dm-devel@redhat.com, Hannes Reinecke <hare@suse.de>, Ming Lei <ming.lei@redhat.com> Subject: [PATCH V5 08/12] block: use per-task poll context to implement bio based io polling Date: Thu, 1 Apr 2021 10:19:23 +0800 Message-Id: <20210401021927.343727-9-ming.lei@redhat.com> In-Reply-To: <20210401021927.343727-1-ming.lei@redhat.com> References: <20210401021927.343727-1-ming.lei@redhat.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	block: support bio based io polling \| expand [V5,00/12] block: support bio based io polling [V5,01/12] block: add helper of blk_queue_poll [V5,02/12] block: add one helper to free io_context [V5,03/12] block: create io poll context for submission and poll task [V5,04/12] block: add req flag of REQ_POLL_CTX [V5,05/12] block: add new field into 'struct bvec_iter' [V5,06/12] block/mq: extract one helper function polling hw queue [V5,07/12] block: prepare for supporting bio_list via other link [V5,08/12] block: use per-task poll context to implement bio based io polling [V5,09/12] blk-mq: limit hw queues to be polled in each blk_poll() [V5,10/12] block: add queue_to_disk() to get gendisk from request_queue [V5,11/12] block: add poll_capable method to support bio-based IO polling [V5,12/12] dm: support IO polling for bio-based dm device

diff --git a/block/bio.c b/block/bio.c index 26b7f721cda8..04c043dc60fc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1402,6 +1402,11 @@ static inline bool bio_remaining_done(struct bio *bio) **/ void bio_endio(struct bio *bio) { + /* BIO_END_BY_POLL has to be set before calling submit_bio */ + if (bio_flagged(bio, BIO_END_BY_POLL)) { + bio_set_flag(bio, BIO_DONE); + return; + } again: if (!bio_remaining_done(bio)) return; diff --git a/block/blk-core.c b/block/blk-core.c index a777ba4fe06f..760e7d96f258 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -805,6 +805,81 @@ static inline unsigned int bio_grp_list_size(unsigned int nr_grps) sizeof(struct bio_grp_list_data); } +static inline void *bio_grp_data(struct bio *bio) +{ + return bio->bi_poll; +} + +/* add bio into bio group list, return true if it is added */ +static bool bio_grp_list_add(struct bio_grp_list *list, struct bio *bio) +{ + int i; + struct bio_grp_list_data *grp; + + for (i = 0; i < list->nr_grps; i++) { + grp = &list->head[i]; + if (grp->grp_data == bio_grp_data(bio)) { + __bio_grp_list_add(&grp->list, bio); + return true; + } + } + + if (i == list->max_nr_grps) + return false; + + /* create a new group */ + grp = &list->head[i]; + bio_list_init(&grp->list); + grp->grp_data = bio_grp_data(bio); + __bio_grp_list_add(&grp->list, bio); + list->nr_grps++; + + return true; +} + +static int bio_grp_list_find_grp(struct bio_grp_list *list, void *grp_data) +{ + int i; + struct bio_grp_list_data *grp; + + for (i = 0; i < list->nr_grps; i++) { + grp = &list->head[i]; + if (grp->grp_data == grp_data) + return i; + } + + if (i < list->max_nr_grps) { + grp = &list->head[i]; + bio_list_init(&grp->list); + return i; + } + + return -1; +} + +/* Move as many as possible groups from 'src' to 'dst' */ +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src) +{ + int i, j, cnt = 0; + struct bio_grp_list_data *grp; + + for (i = src->nr_grps - 1; i >= 0; i--) { + grp = &src->head[i]; + j = bio_grp_list_find_grp(dst, grp->grp_data); + if (j < 0) + break; + if (bio_grp_list_grp_empty(&dst->head[j])) { + dst->head[j].grp_data = grp->grp_data; + dst->nr_grps++; + } + __bio_grp_list_merge(&dst->head[j].list, &grp->list); + bio_list_init(&grp->list); + cnt++; + } + + src->nr_grps -= cnt; +} + static void bio_poll_ctx_init(struct blk_bio_poll_ctx *pc) { pc->sq = (void *)pc + sizeof(*pc); @@ -866,6 +941,47 @@ static inline void blk_bio_poll_preprocess(struct request_queue *q, bio->bi_opf |= REQ_POLL_CTX; } +static inline void blk_bio_poll_mark_queued(struct bio *bio, bool queued) +{ + /* + * The bio has been added to per-task poll queue, mark it as + * END_BY_POLL, so that this bio is always completed from + * blk_poll() which is provided with cookied from this bio's + * submission. + */ + if (!queued) + bio->bi_opf &= ~(REQ_HIPRI | REQ_POLL_CTX); + else + bio_set_flag(bio, BIO_END_BY_POLL); +} + +static bool blk_bio_poll_prep_submit(struct io_context *ioc, struct bio *bio) +{ + struct blk_bio_poll_ctx *pc = ioc->data; + unsigned int queued; + + /* + * We rely on immutable .bi_end_io between blk-mq bio submission + * and completion. However, bio crypt may update .bi_end_io during + * submission, so simply don't support bio based polling for this + * setting. + */ + if (likely(!bio_has_crypt_ctx(bio))) { + /* track this bio via bio group list */ + spin_lock(&pc->sq_lock); + queued = bio_grp_list_add(pc->sq, bio); + blk_bio_poll_mark_queued(bio, queued); + if (queued) + bio_set_private_data(bio, BLK_QC_T_NOT_READY); + spin_unlock(&pc->sq_lock); + } else { + queued = false; + blk_bio_poll_mark_queued(bio, false); + } + + return queued; +} + static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; @@ -1024,7 +1140,7 @@ static blk_qc_t __submit_bio(struct bio *bio) * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio_bio, but that haven't been processed yet. */ -static blk_qc_t __submit_bio_noacct(struct bio *bio) +static blk_qc_t __submit_bio_noacct_ctx(struct bio *bio, struct io_context *ioc) { struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; @@ -1047,7 +1163,15 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = __submit_bio(bio); + if (ioc && queue_is_mq(q) && (bio->bi_opf & REQ_HIPRI)) { + bool queued = blk_bio_poll_prep_submit(ioc, bio); + + ret = __submit_bio(bio); + if (queued) + bio_set_private_data(bio, ret); + } else { + ret = __submit_bio(bio); + } /* * Sort new bios into those for a lower level and those for the @@ -1073,6 +1197,33 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) return ret; } +static inline blk_qc_t __submit_bio_noacct_poll(struct bio *bio, + struct io_context *ioc) +{ + struct blk_bio_poll_ctx *pc = ioc->data; + + __submit_bio_noacct_ctx(bio, ioc); + + /* bio submissions queued to per-task poll context */ + if (READ_ONCE(pc->sq->nr_grps)) + return current->pid; + + /* swapper's pid is 0, but it can't submit poll IO for us */ + return BLK_QC_T_BIO_NONE; +} + +static inline blk_qc_t __submit_bio_noacct(struct bio *bio) +{ + struct io_context *ioc = current->io_context; + + if (ioc && ioc->data && (bio->bi_opf & REQ_HIPRI)) + return __submit_bio_noacct_poll(bio, ioc); + + __submit_bio_noacct_ctx(bio, NULL); + + return BLK_QC_T_BIO_NONE; +} + static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 5574c398eff6..c1fd7c593a54 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -211,6 +211,9 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); + /* drain io poll submissions */ + blk_bio_poll_io_drain(ioc); + atomic_dec(&ioc->nr_tasks); put_io_context_active(ioc); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 0cb88c719916..20bfc8c2d02e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3865,14 +3865,256 @@ static inline int blk_mq_poll_hctx(struct request_queue *q, return ret; } +static int blk_mq_poll_io(struct bio *bio) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + blk_qc_t cookie = bio_get_private_data(bio); + int ret = 0; + + /* wait until the bio is submitted really */ + if (!blk_qc_t_ready(cookie)) + return 0; + + if (!bio_flagged(bio, BIO_DONE) && blk_qc_t_valid(cookie)) { + struct blk_mq_hw_ctx *hctx = + q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + + ret += blk_mq_poll_hctx(q, hctx); + } + return ret; +} + +static int blk_bio_poll_and_end_io(struct bio_grp_list *grps) +{ + int ret = 0; + int i; + + /* + * Poll hw queue first. + * + * TODO: limit max poll times and make sure to not poll same + * hw queue one more time. + */ + for (i = 0; i < grps->nr_grps; i++) { + struct bio_grp_list_data *grp = &grps->head[i]; + struct bio *bio; + + if (bio_grp_list_grp_empty(grp)) + continue; + + for (bio = grp->list.head; bio; bio = bio->bi_poll) + ret += blk_mq_poll_io(bio); + } + + /* reap bios */ + for (i = 0; i < grps->nr_grps; i++) { + struct bio_grp_list_data *grp = &grps->head[i]; + struct bio *bio; + struct bio_list bl; + + if (bio_grp_list_grp_empty(grp)) + continue; + + bio_list_init(&bl); + + while ((bio = __bio_grp_list_pop(&grp->list))) { + if (bio_flagged(bio, BIO_DONE)) { + /* now recover original data */ + bio->bi_poll = grp->grp_data; + + /* clear BIO_END_BY_POLL and end me really */ + bio_clear_flag(bio, BIO_END_BY_POLL); + bio_endio(bio); + } else { + __bio_grp_list_add(&bl, bio); + } + } + __bio_grp_list_merge(&grp->list, &bl); + } + return ret; +} + +static void blk_bio_poll_pack_groups(struct bio_grp_list *grps) +{ + int i, j, k = 0; + int cnt = 0; + + for (i = grps->nr_grps - 1; i >= 0; i--) { + struct bio_grp_list_data *grp = &grps->head[i]; + struct bio_grp_list_data *hole = NULL; + + if (bio_grp_list_grp_empty(grp)) { + cnt++; + continue; + } + + for (j = k; j < i; j++) { + if (bio_grp_list_grp_empty(&grps->head[j])) { + hole = &grps->head[j]; + break; + } + } + if (hole == NULL) + break; + *hole = *grp; + cnt++; + k = j; + } + + grps->nr_grps -= cnt; +} + +#define MAX_BIO_GRPS_ON_STACK 8 +struct bio_grp_list_stack { + unsigned int max_nr_grps, nr_grps; + struct bio_grp_list_data head[MAX_BIO_GRPS_ON_STACK]; +}; + +static int blk_bio_poll_io(struct io_context *submit_ioc, + struct io_context *poll_ioc) + +{ + struct bio_grp_list_stack _bio_grps = { + .max_nr_grps = ARRAY_SIZE(_bio_grps.head), + .nr_grps = 0 + }; + struct bio_grp_list *bio_grps = (struct bio_grp_list *)&_bio_grps; + struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data; + struct blk_bio_poll_ctx *poll_ctx = poll_ioc ? + poll_ioc->data : NULL; + int ret = 0; + + /* + * Move IO submission result from submission queue in submission + * context to poll queue of poll context. + */ + if (READ_ONCE(submit_ctx->sq->nr_grps) > 0) { + spin_lock(&submit_ctx->sq_lock); + bio_grp_list_move(bio_grps, submit_ctx->sq); + spin_unlock(&submit_ctx->sq_lock); + } + + /* merge new bios first, then start to poll bios from pq */ + if (poll_ctx) { + spin_lock(&poll_ctx->pq_lock); + bio_grp_list_move(poll_ctx->pq, bio_grps); + bio_grp_list_move(bio_grps, poll_ctx->pq); + spin_unlock(&poll_ctx->pq_lock); + } + + do { + ret += blk_bio_poll_and_end_io(bio_grps); + blk_bio_poll_pack_groups(bio_grps); + + if (bio_grps->nr_grps) { + /* + * move back, and keep polling until all can be + * held in either poll queue or submission queue. + */ + if (poll_ctx) { + spin_lock(&poll_ctx->pq_lock); + bio_grp_list_move(poll_ctx->pq, bio_grps); + spin_unlock(&poll_ctx->pq_lock); + } else { + spin_lock(&submit_ctx->sq_lock); + bio_grp_list_move(submit_ctx->sq, bio_grps); + spin_unlock(&submit_ctx->sq_lock); + } + } + } while (bio_grps->nr_grps > 0); + + return ret; +} + +void blk_bio_poll_io_drain(struct io_context *submit_ioc) +{ + struct blk_bio_poll_ctx *submit_ctx = submit_ioc->data; + + if (!submit_ctx) + return; + + spin_lock(&submit_ctx->sq_lock); + while (READ_ONCE(submit_ctx->sq->nr_grps) > 0) { + blk_bio_poll_and_end_io(submit_ctx->sq); + blk_bio_poll_pack_groups(submit_ctx->sq); + cpu_relax(); + } + spin_unlock(&submit_ctx->sq_lock); +} + +static bool blk_bio_ioc_valid(struct task_struct *t) +{ + if (!t) + return false; + + if (!t->io_context) + return false; + + if (!t->io_context->data) + return false; + + return true; +} + +static int __blk_bio_poll(blk_qc_t cookie) +{ + struct io_context *poll_ioc = current->io_context; + pid_t pid; + struct task_struct *submit_task; + int ret; + + pid = (pid_t)cookie; + + /* io poll often share io submission context */ + if (likely(current->pid == pid && blk_bio_ioc_valid(current))) + return blk_bio_poll_io(poll_ioc, poll_ioc); + + submit_task = find_get_task_by_vpid(pid); + if (likely(blk_bio_ioc_valid(submit_task))) + ret = blk_bio_poll_io(submit_task->io_context, poll_ioc); + else + ret = 0; + if (likely(submit_task)) + put_task_struct(submit_task); + + return ret; +} + static int blk_bio_poll(struct request_queue *q, blk_qc_t cookie, bool spin) { + long state; + + /* no need to poll */ + if (cookie == BLK_QC_T_BIO_NONE) + return 0; + /* * Create poll queue for storing poll bio and its cookie from * submission queue */ blk_create_io_poll_context(q); + state = current->state; + do { + int ret; + + ret = __blk_bio_poll(cookie); + if (ret > 0) { + __set_current_state(TASK_RUNNING); + return ret; + } + + if (signal_pending_state(state, current)) + __set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return 1; + if (ret < 0 || !spin) + break; + cpu_relax(); + } while (!need_resched()); + + __set_current_state(TASK_RUNNING); return 0; } @@ -3893,7 +4135,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) struct blk_mq_hw_ctx *hctx; long state; - if (!blk_qc_t_valid(cookie) || !blk_queue_poll(q)) + if (!blk_queue_poll(q) || (queue_is_mq(q) && !blk_qc_t_valid(cookie))) return 0; if (current->plug) diff --git a/block/blk.h b/block/blk.h index c1d8456656df..c0216f2e6293 100644 --- a/block/blk.h +++ b/block/blk.h @@ -381,6 +381,7 @@ struct blk_bio_poll_ctx { #define BLK_BIO_POLL_SQ_SZ 16U #define BLK_BIO_POLL_PQ_SZ (BLK_BIO_POLL_SQ_SZ * 2) +void blk_bio_poll_io_drain(struct io_context *submit_ioc); void bio_poll_ctx_alloc(struct io_context *ioc); static inline void blk_create_io_poll_context(struct request_queue *q) @@ -405,4 +406,13 @@ static inline void bio_set_private_data(struct bio *bio, unsigned int data) bio->bi_iter.bi_private_data = data; } +BIO_LIST_HELPERS(__bio_grp_list, poll); + +static inline bool bio_grp_list_grp_empty(struct bio_grp_list_data *grp) +{ + return bio_list_empty(&grp->list); +} + +void bio_grp_list_move(struct bio_grp_list *dst, struct bio_grp_list *src); + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 99160d588c2d..178af164ba2a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -235,7 +235,18 @@ struct bio { struct bvec_iter bi_iter; - bio_end_io_t *bi_end_io; + union { + bio_end_io_t *bi_end_io; + /* + * bio based io polling needs to track bio via bio group + * list which groups bios by their .bi_end_io, and original + * .bi_end_io is saved into the group head. Will recover + * .bi_end_io before really ending bio. BIO_END_BY_POLL + * will make sure that this bio won't be ended before + * recovering .bi_end_io. + */ + struct bio *bi_poll; + }; void *bi_private; #ifdef CONFIG_BLK_CGROUP @@ -304,6 +315,9 @@ enum { BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ BIO_REMAPPED, + BIO_END_BY_POLL, /* end by blk_bio_poll() explicitly */ + /* set when bio can be ended, used for bio with BIO_END_BY_POLL */ + BIO_DONE, BIO_FLAG_LAST }; @@ -513,6 +527,16 @@ typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U #define BLK_QC_T_SHIFT 16 #define BLK_QC_T_INTERNAL (1U << 31) +/* only used for bio based submission, has to be defined as 0 */ +#define BLK_QC_T_BIO_NONE 0 +/* only used for bio based polling, not ready for polling */ +#define BLK_QC_T_NOT_READY -2U + +/* not ready for bio based polling since this bio isn't submitted really */ +static inline bool blk_qc_t_ready(blk_qc_t cookie) +{ + return cookie != BLK_QC_T_NOT_READY; +} static inline bool blk_qc_t_valid(blk_qc_t cookie) {

[V5,08/12] block: use per-task poll context to implement bio based io polling

Commit Message

Comments

Patch