
[V10,07/11] blk-mq: stop to handle IO and drain IO before hctx becomes inactive

Message ID 20200505020930.1146281-8-ming.lei@redhat.com (mailing list archive)
State New, archived
Series blk-mq: improvement CPU hotplug

Commit Message

Ming Lei May 5, 2020, 2:09 a.m. UTC
Before a CPU goes offline, check whether it is the last online CPU of the
hctx. If so, mark this hctx as inactive and wait for completion of all
in-flight IOs originated from this hctx. Meanwhile, check in
blk_mq_get_driver_tag() whether this hctx has become inactive; if so,
release the allocated tag.

This guarantees that there is no in-flight IO before shutting down the
managed IRQ line when all CPUs of this IRQ line are offline.
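
For orientation, a condensed sketch of the two sides of the mechanism,
simplified from the hunks in this patch (it is not the literal code):

    /* CPU-offline side (cpuhp callback), simplified: mark the hctx inactive,
     * then wait until no driver tag of this hctx is in use any more. */
    if (blk_mq_last_cpu_in_hctx(cpu, hctx)) {
            set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
            smp_mb__after_atomic();         /* pairs with smp_mb() below */
            while (blk_mq_tags_inflight_rqs(hctx))
                    msleep(5);
    }

    /* issue side, simplified: after a driver tag is assigned, re-check
     * BLK_MQ_S_INACTIVE and back out if the hctx has gone inactive. */
    if (!__blk_mq_get_driver_tag(rq))
            return false;
    smp_mb();       /* barrier() suffices while still on a CPU of this hctx */
    if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state)) {
            blk_mq_put_driver_tag(rq);
            return false;
    }
    return true;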

Cc: John Garry <john.garry@huawei.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Tested-by: John Garry <john.garry@huawei.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c |   1 +
 block/blk-mq.c         | 124 +++++++++++++++++++++++++++++++++++++----
 include/linux/blk-mq.h |   3 +
 3 files changed, 117 insertions(+), 11 deletions(-)

Comments

Bart Van Assche May 8, 2020, 11:39 p.m. UTC | #1
On 2020-05-04 19:09, Ming Lei wrote:
> -static bool blk_mq_get_driver_tag(struct request *rq)
> +static bool blk_mq_get_driver_tag(struct request *rq, bool direct_issue)
>  {
>  	if (rq->tag != -1)
>  		return true;
> -	return __blk_mq_get_driver_tag(rq);
> +
> +	if (!__blk_mq_get_driver_tag(rq))
> +		return false;
> +	/*
> +	 * In case that direct issue IO process is migrated to other CPU
> +	 * which may not belong to this hctx, add one memory barrier so we
> +	 * can order driver tag assignment and checking BLK_MQ_S_INACTIVE.
> +	 * Otherwise, barrier() is enough given both setting BLK_MQ_S_INACTIVE
> +	 * and driver tag assignment are run on the same CPU because
> +	 * BLK_MQ_S_INACTIVE is only set after the last CPU of this hctx is
> +	 * becoming offline.
> +	 *
> +	 * Process migration might happen after the check on current processor
> +	 * id, smp_mb() is implied by processor migration, so no need to worry
> +	 * about it.
> +	 */
> +	if (unlikely(direct_issue && rq->mq_ctx->cpu != raw_smp_processor_id()))
> +		smp_mb();
> +	else
> +		barrier();
> +
> +	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))) {
> +		blk_mq_put_driver_tag(rq);
> +		return false;
> +	}
> +	return true;
>  }

How much does this patch slow down the hot path?

Can CPU migration be fixed without affecting the hot path, e.g. by using
the request queue freezing mechanism?

Thanks,

Bart.
Ming Lei May 9, 2020, 2:20 a.m. UTC | #2
On Fri, May 08, 2020 at 04:39:46PM -0700, Bart Van Assche wrote:
> On 2020-05-04 19:09, Ming Lei wrote:
> > -static bool blk_mq_get_driver_tag(struct request *rq)
> > +static bool blk_mq_get_driver_tag(struct request *rq, bool direct_issue)
> >  {
> >  	if (rq->tag != -1)
> >  		return true;
> > -	return __blk_mq_get_driver_tag(rq);
> > +
> > +	if (!__blk_mq_get_driver_tag(rq))
> > +		return false;
> > +	/*
> > +	 * In case that direct issue IO process is migrated to other CPU
> > +	 * which may not belong to this hctx, add one memory barrier so we
> > +	 * can order driver tag assignment and checking BLK_MQ_S_INACTIVE.
> > +	 * Otherwise, barrier() is enough given both setting BLK_MQ_S_INACTIVE
> > +	 * and driver tag assignment are run on the same CPU because
> > +	 * BLK_MQ_S_INACTIVE is only set after the last CPU of this hctx is
> > +	 * becoming offline.
> > +	 *
> > +	 * Process migration might happen after the check on current processor
> > +	 * id, smp_mb() is implied by processor migration, so no need to worry
> > +	 * about it.
> > +	 */
> > +	if (unlikely(direct_issue && rq->mq_ctx->cpu != raw_smp_processor_id()))
> > +		smp_mb();
> > +	else
> > +		barrier();
> > +
> > +	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))) {
> > +		blk_mq_put_driver_tag(rq);
> > +		return false;
> > +	}
> > +	return true;
> >  }
> 
> How much does this patch slow down the hot path?

Basically zero cost is added to the hot path. Specifically:

> +	if (unlikely(direct_issue && rq->mq_ctx->cpu != raw_smp_processor_id()))

In case of direct issue, the chance that the IO process has migrated is very
small, since direct issue basically follows request allocation and the window
is quite short, so smp_mb() won't be run most of the time.

> +		smp_mb();
> +	else
> +		barrier();

So barrier() is taken most of the time, and its effect can be ignored
since it is just a compiler barrier.

> +
> +	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))) {

hctx->state is checked in the hot path anyway, so this adds basically zero cost.

> +		blk_mq_put_driver_tag(rq);
> +		return false;
> +	}

> 
> Can CPU migration be fixed without affecting the hot path, e.g. by using
> the request queue freezing mechanism?

Why would we want to prevent CPU migration of the direct-issue IO process?
It may not be necessary, and it would be quite difficult:

1) preempt disable was removed previously in a cleanup patch once the
request is allocated

2) we have drivers which may set BLOCKING, so .queue_rq() may sleep

Not sure why you mention queue freezing.


Thanks,
Ming
Bart Van Assche May 9, 2020, 3:24 a.m. UTC | #3
On 2020-05-08 19:20, Ming Lei wrote:
> Not sure why you mention queue freezing.

This patch series introduces a fundamental race between modifying the
hardware queue state (BLK_MQ_S_INACTIVE) and tag allocation. The only
mechanism I know of for enforcing the order in which another thread
observes writes to different memory locations without inserting a memory
barrier in the hot path is RCU (see also The RCU-barrier menagerie;
https://lwn.net/Articles/573497/). The only existing such mechanism in
the blk-mq core I know of is queue freezing. Hence my comment about
queue freezing.

Thanks,

Bart.
Ming Lei May 9, 2020, 4:10 a.m. UTC | #4
On Fri, May 08, 2020 at 08:24:44PM -0700, Bart Van Assche wrote:
> On 2020-05-08 19:20, Ming Lei wrote:
> > Not sure why you mention queue freezing.
> 
> This patch series introduces a fundamental race between modifying the
> hardware queue state (BLK_MQ_S_INACTIVE) and tag allocation. The only

Basically there are two cases:

1) setting BLK_MQ_S_INACTIVE and driver tag allocation run on the same
CPU; then a compiler barrier is enough. That is what happens most of the time.

2) setting BLK_MQ_S_INACTIVE and driver tag allocation run on different
CPUs; then a pair of smp_mb()s is applied to prevent reordering. That only
happens when the direct-issue process has migrated; the pairing is sketched below.
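
In a minimal sketch (the identifiers are the ones used in this patch; the
real code is quoted below):

    /* CPU-offline side (slow path, runs once per hctx): */
    set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
    smp_mb__after_atomic();     /* order the set_bit before reading rq->tag */
    busy = blk_mq_tags_inflight_rqs(hctx);

    /* direct-issue side, after migrating to a CPU outside hctx->cpumask: */
    rq->tag = tag;              /* driver tag assignment */
    smp_mb();                   /* order the tag store before reading hctx->state */
    inactive = test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state);

At least one side must observe the other's store: either the offline side
sees the assigned tag and waits for it, or the issue side sees
BLK_MQ_S_INACTIVE and puts the tag back.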

Please take a look at the comment in this patch:

+       /*
+        * In case that direct issue IO process is migrated to other CPU
+        * which may not belong to this hctx, add one memory barrier so we
+        * can order driver tag assignment and checking BLK_MQ_S_INACTIVE.
+        * Otherwise, barrier() is enough given both setting BLK_MQ_S_INACTIVE
+        * and driver tag assignment are run on the same CPU because
+        * BLK_MQ_S_INACTIVE is only set after the last CPU of this hctx is
+        * becoming offline.
+        *
+        * Process migration might happen after the check on current processor
+        * id, smp_mb() is implied by processor migration, so no need to worry
+        * about it.
+        */

And you may find more discussion about this topic in the following thread:

https://lore.kernel.org/linux-block/20200429134327.GC700644@T590/

> mechanism I know of for enforcing the order in which another thread
> observes writes to different memory locations without inserting a memory
> barrier in the hot path is RCU (see also The RCU-barrier menagerie;
> https://lwn.net/Articles/573497/). The only existing such mechanism in
> the blk-mq core I know of is queue freezing. Hence my comment about
> queue freezing.

You didn't explain how queue freezing is used for this issue.

We are talking about CPU hotplug vs. IO. In short, when one hctx becomes
inactive (all CPUs in hctx->cpumask become offline), in-flight IO from this
hctx needs to be drained to avoid IO timeouts. Also, all requests in the
scheduler/sw queues of this hctx need to be handled correctly to avoid an
IO hang.

Queue freezing can only be applied at the request queue level, not the hctx
level. When requests can't be completed, waiting for the freeze just hangs
forever.



Thanks,
Ming
Bart Van Assche May 9, 2020, 2:18 p.m. UTC | #5
On 2020-05-08 21:10, Ming Lei wrote:
> queue freezing can only be applied on the request queue level, and not
> hctx level. When requests can't be completed, wait freezing just hangs
> for-ever.

That's indeed what I meant: freeze the entire queue instead of
introducing a new mechanism that freezes only one hardware queue at a time.

Please clarify what "when requests can't be completed" means. Are you
referring to requests that take longer than expected due to e.g. a
controller lockup or to requests that take a long time intentionally?
The former case is handled by the block layer timeout handler. I propose
to handle the latter case by introducing a new callback function pointer
in struct blk_mq_ops that aborts all outstanding requests. Request queue
freezing is such an important block layer mechanism that I think we
should require that all block drivers support freezing a request queue
in a short time.
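
A purely illustrative sketch of such a callback; the member name and
signature are assumptions, not an existing interface:

    /* hypothetical new member of struct blk_mq_ops: abort every outstanding
     * request of a hardware queue so that a queue freeze finishes promptly */
    void (*abort_requests)(struct blk_mq_hw_ctx *hctx);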

Bart.
Ming Lei May 11, 2020, 1:45 a.m. UTC | #6
On Sat, May 09, 2020 at 07:18:46AM -0700, Bart Van Assche wrote:
> On 2020-05-08 21:10, Ming Lei wrote:
> > queue freezing can only be applied on the request queue level, and not
> > hctx level. When requests can't be completed, wait freezing just hangs
> > for-ever.
> 
> That's indeed what I meant: freeze the entire queue instead of
> introducing a new mechanism that freezes only one hardware queue at a time.

No, the issue is exactly that one single hctx becomes inactive while the
other hctxs are still active and workable.

If the entire queue is frozen because some of its CPUs are offline, how can
userspace submit IO to this disk? Your suggestion just makes the disk
unusable, and that won't be accepted.

> 
> Please clarify what "when requests can't be completed" means. Are you
> referring to requests that take longer than expected due to e.g. a
> controller lockup or to requests that take a long time intentionally?

If all CPUs in one hctx->cpumask are offline, the managed irq of this hw
queue will be shut down by the genirq code, so after the managed irq is shut
down because of the CPU offlining, any in-flight IO either won't be completed
at all or can only be timed out.

Some drivers implement a timeout handler, so these in-flight requests will
eventually time out, but that is still not friendly behaviour given that the
default timeout is quite long.

Some drivers don't implement a timeout handler at all, so these IOs won't be
completed.

> The former case is handled by the block layer timeout handler. I propose
> to handle the latter case by introducing a new callback function pointer
> in struct blk_mq_ops that aborts all outstanding requests.

As I mentioned, timing out isn't friendly behavior. Also, not every driver
implements a timeout handler, or implements it well enough.

> Request queue
> freezing is such an important block layer mechanism that I think we
> should require that all block drivers support freezing a request queue
> in a short time.

Firstly, we just need to drain in-flight requests and re-submit queued
requests from one single hctx; queue-wide freezing blocks all userspace IO
unnecessarily.

Secondly, some requests may not be completed at all, so freezing can't work
because freeze_wait may hang forever.


Thanks, 
Ming
Bart Van Assche May 11, 2020, 3:20 a.m. UTC | #7
On 2020-05-10 18:45, Ming Lei wrote:
> On Sat, May 09, 2020 at 07:18:46AM -0700, Bart Van Assche wrote:
>> On 2020-05-08 21:10, Ming Lei wrote:
>>> queue freezing can only be applied on the request queue level, and not
>>> hctx level. When requests can't be completed, wait freezing just hangs
>>> for-ever.
>>
>> That's indeed what I meant: freeze the entire queue instead of
>> introducing a new mechanism that freezes only one hardware queue at a time.
> 
> No, the issue is exactly that one single hctx becomes inactive, and
> other hctx are still active and workable.
> 
> If one entire queue is frozen because of some of CPUs are offline, how
> can userspace submit IO to this disk? You suggestion justs makes the
> disk not usable, that won't be accepted.

What I meant is to freeze a request queue temporarily (until hot
unplugging of a CPU has finished). I would never suggest to freeze a
request queue forever and I think that you already knew that.

>> Please clarify what "when requests can't be completed" means. Are you
>> referring to requests that take longer than expected due to e.g. a
>> controller lockup or to requests that take a long time intentionally?
> 
> If all CPUs in one hctx->cpumask are offline, the managed irq of this hw
> queue will be shutdown by genirq code, so any in-flight IO won't be
> completed or timedout after the managed irq is shutdown because of cpu
> offline.
> 
> Some drivers may implement timeout handler, so these in-flight requests
> will be timed out, but still not friendly behaviour given the default
> timeout is too long.
> 
> Some drivers don't implement timeout handler at all, so these IO won't
> be completed.

I think that the block layer needs to be notified after the decision has
been taken to offline a CPU and before the interrupts associated with
that CPU are disabled. That would allow the block layer to freeze a
request queue without triggering any timeouts (ignoring block driver and
hardware bugs). I'm not familiar with CPU hotplugging so I don't know
whether or not such a mechanism already exists.

>> The former case is handled by the block layer timeout handler. I propose
>> to handle the latter case by introducing a new callback function pointer
>> in struct blk_mq_ops that aborts all outstanding requests.
> 
> As I mentioned, timeout isn't a friendly behavior. Or not every driver
> implements timeout handler or well enough.

What I propose is to fix those block drivers instead of complicating the
block layer core further and instead of introducing potential deadlocks
in the block layer core.

>> Request queue
>> freezing is such an important block layer mechanism that I think we
>> should require that all block drivers support freezing a request queue
>> in a short time.
> 
> Firstly, we just need to drain in-flight requests and re-submit queued
> requests from one single hctx, and queue wide freezing causes whole
> userspace IOs blocked unnecessarily.

Freezing a request queue for a short time is acceptable. As you know we
already do that when the queue depth is modified, when the write-back
throttling latency is modified and also when the I/O scheduler is changed.

> Secondly, some requests may not be completed at all, so freezing can't
> work because freeze_wait may hang forever.

If a request neither can be aborted nor completes then that's a severe
bug in the block driver that submitted the request to the block device.

Bart.
Ming Lei May 11, 2020, 3:48 a.m. UTC | #8
On Sun, May 10, 2020 at 08:20:24PM -0700, Bart Van Assche wrote:
> On 2020-05-10 18:45, Ming Lei wrote:
> > On Sat, May 09, 2020 at 07:18:46AM -0700, Bart Van Assche wrote:
> >> On 2020-05-08 21:10, Ming Lei wrote:
> >>> queue freezing can only be applied on the request queue level, and not
> >>> hctx level. When requests can't be completed, wait freezing just hangs
> >>> for-ever.
> >>
> >> That's indeed what I meant: freeze the entire queue instead of
> >> introducing a new mechanism that freezes only one hardware queue at a time.
> > 
> > No, the issue is exactly that one single hctx becomes inactive, and
> > other hctx are still active and workable.
> > 
> > If one entire queue is frozen because of some of CPUs are offline, how
> > can userspace submit IO to this disk? You suggestion justs makes the
> > disk not usable, that won't be accepted.
> 
> What I meant is to freeze a request queue temporarily (until hot
> unplugging of a CPU has finished). I would never suggest to freeze a
> request queue forever and I think that you already knew that.

But what is your motivation for freezing the queue temporarily?

I don't see how freezing the queue helps with this issue. Also, even though
it is temporary, the IO effect can still be observed on the other online CPUs.

If you want to block new allocation from the inactive hctx, that isn't
necessary, because basically no new allocation is possible once all CPUs of
this hctx are offline.

If you want to wait for completion of in-flight requests, that isn't doable,
because requests may not be completed at all once the hctx becomes inactive
and its managed interrupt is shut down.

> 
> >> Please clarify what "when requests can't be completed" means. Are you
> >> referring to requests that take longer than expected due to e.g. a
> >> controller lockup or to requests that take a long time intentionally?
> > 
> > If all CPUs in one hctx->cpumask are offline, the managed irq of this hw
> > queue will be shutdown by genirq code, so any in-flight IO won't be
> > completed or timedout after the managed irq is shutdown because of cpu
> > offline.
> > 
> > Some drivers may implement timeout handler, so these in-flight requests
> > will be timed out, but still not friendly behaviour given the default
> > timeout is too long.
> > 
> > Some drivers don't implement timeout handler at all, so these IO won't
> > be completed.
> 
> I think that the block layer needs to be notified after the decision has

I have added a new cpuhp state, CPUHP_AP_BLK_MQ_ONLINE, for getting the
notification, and blk_mq_hctx_notify_online() will be called before this CPU
is put offline.
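
Roughly, the hookup uses the cpuhp multi-instance API; the sketch below only
illustrates the interfaces involved and is an approximation (the state name
string is illustrative), not copied from this hunk:

    /* register startup/teardown callbacks for the new state, once at init */
    cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
                            blk_mq_hctx_notify_online,      /* CPU coming up  */
                            blk_mq_hctx_notify_offline);    /* CPU going down */

    /* and one instance per hardware queue, when the hctx is initialized */
    cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online);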

> been taken to offline a CPU and before the interrupts associated with
> that CPU are disabled. That would allow the block layer to freeze a
> request queue without triggering any timeouts (ignoring block driver and
> hardware bugs). I'm not familiar with CPU hotplugging so I don't know
> whether or not such a mechanism already exists.

How can freezing the queue avoid triggering timeouts?

Freezing a queue basically blocks new request allocation and then waits for
completion of all in-flight requests. As I explained, either no new
allocation happens on this inactive hctx anyway, or the in-flight requests
won't be completed at all without this patch's solution.

> 
> >> The former case is handled by the block layer timeout handler. I propose
> >> to handle the latter case by introducing a new callback function pointer
> >> in struct blk_mq_ops that aborts all outstanding requests.
> > 
> > As I mentioned, timeout isn't a friendly behavior. Or not every driver
> > implements timeout handler or well enough.
> 
> What I propose is to fix those block drivers instead of complicating the
> block layer core further and instead of introducing potential deadlocks
> in the block layer core.

The deadlock you mentioned can be fixed with the help of BLK_MQ_REQ_PREEMPT.

> 
> >> Request queue
> >> freezing is such an important block layer mechanism that I think we
> >> should require that all block drivers support freezing a request queue
> >> in a short time.
> > 
> > Firstly, we just need to drain in-flight requests and re-submit queued
> > requests from one single hctx, and queue wide freezing causes whole
> > userspace IOs blocked unnecessarily.
> 
> Freezing a request queue for a short time is acceptable. As you know we
> already do that when the queue depth is modified, when the write-back
> throttling latency is modified and also when the I/O scheduler is changed.

Again, how can freezing the queue help with the issue addressed by this patchset?

> 
> > Secondly, some requests may not be completed at all, so freezing can't
> > work because freeze_wait may hang forever.
> 
> If a request neither can be aborted nor completes then that's a severe
> bug in the block driver that submitted the request to the block device.

It is hard to implement a timeout handler for every driver, or to remove
every BLK_EH_RESET_TIMER return from drivers.

Even for drivers that implement a timeout handler elegantly, it isn't
friendly to wait several dozen seconds, or more than one hundred seconds,
for IO completion during CPU hotplug. Who said that IO timeouts have to be
triggered during CPU hotplug? At least there is no such issue with
non-managed interrupts.



Thanks, 
Ming
Bart Van Assche May 11, 2020, 8:56 p.m. UTC | #9
On 2020-05-10 20:48, Ming Lei wrote:
> On Sun, May 10, 2020 at 08:20:24PM -0700, Bart Van Assche wrote:
>> What I meant is to freeze a request queue temporarily.
> 
> But what is your motivation to freeze queue temporarily?

To achieve a solution for CPU hotplugging that is much simpler than this
patch series, requires less code and hence is easier to test, debug and
maintain.

Thanks,

Bart.
Ming Lei May 12, 2020, 1:25 a.m. UTC | #10
On Mon, May 11, 2020 at 01:56:49PM -0700, Bart Van Assche wrote:
> On 2020-05-10 20:48, Ming Lei wrote:
> > On Sun, May 10, 2020 at 08:20:24PM -0700, Bart Van Assche wrote:
> >> What I meant is to freeze a request queue temporarily.
> > 
> > But what is your motivation to freeze queue temporarily?
> 
> To achieve a solution for CPU hotplugging that is much simpler than this
> patch series, requires less code and hence is easier to test, debug and
> maintain.

Yeah, it could be done via queue freezing in the following way:

1) before one CPU goes offline

- if a hctx would become inactive, freeze the whole queue and
wait for the freeze to complete

2) after the CPU is offline
- un-freeze the request queue if any hctx is inactive

The whole disk becomes unusable during that period, which can be quite long,
because freezing the queue and waiting for the freeze to finish takes at
least one RCU grace period, even when there isn't any in-flight IO.

Also, the above steps would have to be run for every request queue in a
serialized way, so the total time that IO is suspended can be very long.
That isn't reasonable.
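
For concreteness, a hypothetical sketch of that freeze-based alternative (it
is not part of this series; where the calls would live is an assumption,
shown only to illustrate why the freeze is queue-wide):

    /* hypothetical step 1: in the offline notifier, before the last CPU of
     * the hctx goes away -- this blocks new requests for the *whole* queue
     * and waits for every in-flight request on *every* hctx to complete */
    if (cpumask_test_cpu(cpu, hctx->cpumask) &&
        blk_mq_last_cpu_in_hctx(cpu, hctx))
            blk_mq_freeze_queue(hctx->queue);

    /* hypothetical step 2: once the CPU is offline (e.g. from the dead
     * notifier), let IO flow again */
    blk_mq_unfreeze_queue(hctx->queue);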

Thanks,
Ming

Patch

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index ddec58743e88..dc66cb689d2f 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@  static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
+	HCTX_STATE_NAME(INACTIVE),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 54c107be7a47..4a2250ac4fbb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1038,11 +1038,36 @@  static bool __blk_mq_get_driver_tag(struct request *rq)
 	return true;
 }
 
-static bool blk_mq_get_driver_tag(struct request *rq)
+static bool blk_mq_get_driver_tag(struct request *rq, bool direct_issue)
 {
 	if (rq->tag != -1)
 		return true;
-	return __blk_mq_get_driver_tag(rq);
+
+	if (!__blk_mq_get_driver_tag(rq))
+		return false;
+	/*
+	 * In case that direct issue IO process is migrated to other CPU
+	 * which may not belong to this hctx, add one memory barrier so we
+	 * can order driver tag assignment and checking BLK_MQ_S_INACTIVE.
+	 * Otherwise, barrier() is enough given both setting BLK_MQ_S_INACTIVE
+	 * and driver tag assignment are run on the same CPU because
+	 * BLK_MQ_S_INACTIVE is only set after the last CPU of this hctx is
+	 * becoming offline.
+	 *
+	 * Process migration might happen after the check on current processor
+	 * id, smp_mb() is implied by processor migration, so no need to worry
+	 * about it.
+	 */
+	if (unlikely(direct_issue && rq->mq_ctx->cpu != raw_smp_processor_id()))
+		smp_mb();
+	else
+		barrier();
+
+	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))) {
+		blk_mq_put_driver_tag(rq);
+		return false;
+	}
+	return true;
 }
 
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1091,7 +1116,7 @@  static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 		 * Don't clear RESTART here, someone else could have set it.
 		 * At most this will cost an extra queue run.
 		 */
-		return blk_mq_get_driver_tag(rq);
+		return blk_mq_get_driver_tag(rq, false);
 	}
 
 	wait = &hctx->dispatch_wait;
@@ -1117,7 +1142,7 @@  static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
 	 * allocation failure and adding the hardware queue to the wait
 	 * queue.
 	 */
-	ret = blk_mq_get_driver_tag(rq);
+	ret = blk_mq_get_driver_tag(rq, false);
 	if (!ret) {
 		spin_unlock(&hctx->dispatch_wait_lock);
 		spin_unlock_irq(&wq->lock);
@@ -1218,7 +1243,7 @@  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			break;
 		}
 
-		if (!blk_mq_get_driver_tag(rq)) {
+		if (!blk_mq_get_driver_tag(rq, false)) {
 			/*
 			 * The initial allocation attempt failed, so we need to
 			 * rerun the hardware queue when a tag is freed. The
@@ -1250,7 +1275,7 @@  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			bd.last = true;
 		else {
 			nxt = list_first_entry(list, struct request, queuelist);
-			bd.last = !blk_mq_get_driver_tag(nxt);
+			bd.last = !blk_mq_get_driver_tag(nxt, false);
 		}
 
 		ret = q->mq_ops->queue_rq(hctx, &bd);
@@ -1864,7 +1889,7 @@  static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	if (!blk_mq_get_dispatch_budget(hctx))
 		goto insert;
 
-	if (!blk_mq_get_driver_tag(rq)) {
+	if (!blk_mq_get_driver_tag(rq, true)) {
 		blk_mq_put_dispatch_budget(hctx);
 		goto insert;
 	}
@@ -2273,13 +2298,87 @@  int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return -ENOMEM;
 }
 
-static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+struct count_inflight_data {
+	unsigned count;
+	struct blk_mq_hw_ctx *hctx;
+};
+
+static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
+				     bool reserved)
 {
-	return 0;
+	struct count_inflight_data *count_data = data;
+
+	/*
+	 * Can't check rq's state because it is updated to MQ_RQ_IN_FLIGHT
+	 * in blk_mq_start_request(), at that time we can't prevent this rq
+	 * from being issued.
+	 *
+	 * So check if driver tag is assigned, if yes, count this rq as
+	 * inflight.
+	 */
+	if (rq->tag >= 0 && rq->mq_hctx == count_data->hctx)
+		count_data->count++;
+
+	return true;
+}
+
+static bool blk_mq_inflight_rq(struct request *rq, void *data,
+			       bool reserved)
+{
+	return rq->tag >= 0;
+}
+
+static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct count_inflight_data count_data = {
+		.hctx	= hctx,
+	};
+
+	blk_mq_all_tag_busy_iter(hctx->tags, blk_mq_count_inflight_rq,
+			blk_mq_inflight_rq, &count_data);
+	return count_data.count;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+		struct blk_mq_hw_ctx *hctx)
+{
+	if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+		return false;
+	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+		return false;
+	return true;
 }
 
 static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
 {
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
+	if (!blk_mq_last_cpu_in_hctx(cpu, hctx))
+		return 0;
+
+	/*
+	 * Order setting BLK_MQ_S_INACTIVE versus checking rq->tag and rqs[tag],
+	 * in blk_mq_tags_inflight_rqs.  It pairs with the smp_mb() in
+	 * blk_mq_get_driver_tag.
+	 */
+	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+	smp_mb__after_atomic();
+	while (blk_mq_tags_inflight_rqs(hctx))
+		msleep(5);
+	return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (cpumask_test_cpu(cpu, hctx->cpumask))
+		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
 	return 0;
 }
 
@@ -2290,12 +2389,15 @@  static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
  */
 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_dead);
 	struct blk_mq_ctx *ctx;
 	LIST_HEAD(tmp);
 	enum hctx_type type;
 
-	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
 	type = hctx->type;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3763207d88eb..77bf861d72ec 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -403,6 +403,9 @@  enum {
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
 
+	/* hw queue is inactive after all its CPUs become offline */
+	BLK_MQ_S_INACTIVE	= 3,
+
 	BLK_MQ_MAX_DEPTH	= 10240,
 
 	BLK_MQ_CPU_WORK_BATCH	= 8,