[V6,4/8] blk-mq: stop handling IO and drain IO before hctx becomes inactive

Message ID 20200407092901.314228-5-ming.lei@redhat.com
State New, archived
Series blk-mq: improvement CPU hotplug

Commit Message

Ming Lei April 7, 2020, 9:28 a.m. UTC
Before a CPU goes offline, check whether it is the last online CPU of the
hctx. If so, mark the hctx as inactive, and wait for completion of all
in-flight IOs originated from this hctx.

This guarantees that there is no in-flight IO before the managed IRQ line
is shut down.

Cc: John Garry <john.garry@huawei.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-tag.c |  2 +-
 block/blk-mq-tag.h |  2 ++
 block/blk-mq.c     | 67 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+), 1 deletion(-)
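
For context, blk_mq_hctx_notify_online() below is the teardown side of a
multi-instance CPU hotplug state, invoked on the CPU that is about to go
offline. A minimal sketch of how such a state can be registered, assuming
the CPUHP_AP_BLK_MQ_ONLINE state added elsewhere in this series (the init
function name and the state string here are assumptions, not shown in this
patch):

/*
 * Sketch only: register the drain handler as the teardown callback of
 * an "online" (AP) hotplug state, so it runs on the CPU going away,
 * before its managed IRQ line is shut down.
 */
static int __init blk_mq_hotplug_init(void)
{
	return cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE,
				       "block/mq:online", NULL,
				       blk_mq_hctx_notify_online);
}

Each hctx is then attached as an instance via
cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online),
which is where the hlist_node passed to the callback comes from.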

Patch

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 586c9d6e904a..82a58b2cebe7 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -317,7 +317,7 @@  static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
  *		true to continue iterating tags, false to stop.
  * @priv:	Will be passed as second argument to @fn.
  */
-static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
 		busy_tag_iter_fn *fn, void *priv)
 {
 	if (tags->nr_reserved_tags)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 2b8321efb682..346d570d52a9 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -34,6 +34,8 @@  extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+		busy_tag_iter_fn *fn, void *priv);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 						 struct blk_mq_hw_ctx *hctx)
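
For reference, the contract of the iterator just exported: the
busy_tag_iter_fn callback is invoked once per busy tag, with @reserved
telling whether the tag comes from the reserved set, and its return value
decides whether iteration continues. A small hypothetical callback, purely
for illustration (not part of this patch):

/* Hypothetical callback: count busy tags that are not reserved. */
static bool count_unreserved(struct request *rq, void *priv, bool reserved)
{
	if (!reserved)
		(*(unsigned int *)priv)++;
	return true;	/* true continues the walk; false would stop it */
}

A caller would do: unsigned int n = 0;
blk_mq_all_tag_busy_iter(tags, count_unreserved, &n);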
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4ee8695142c0..aac86cd99f02 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1054,6 +1054,11 @@  bool blk_mq_get_driver_tag(struct request *rq)
 		data.hctx->tags->rqs[rq->tag] = rq;
 	}
 
+	/*
+	 * The hctx is being drained for CPU offline: put the tag back
+	 * and fail so the request is not issued on an inactive hctx.
+	 */
+	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data.hctx->state))) {
+		blk_mq_put_driver_tag(rq);
+		return false;
+	}
+
 	return rq->tag != -1;
 }
 
@@ -2249,8 +2254,68 @@  int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return -ENOMEM;
 }
 
+struct count_inflight_data {
+	unsigned count;
+	struct blk_mq_hw_ctx *hctx;
+};
+
+static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
+				     bool reserved)
+{
+	struct count_inflight_data *count_data = data;
+
+	/*
+	 * We can't check rq's state here: it is only updated to
+	 * MQ_RQ_IN_FLIGHT in blk_mq_start_request(), and by then the
+	 * rq can no longer be prevented from being issued.
+	 *
+	 * So count the rq as inflight if a driver tag is assigned.
+	 */
+	if (rq->tag >= 0 && rq->mq_hctx == count_data->hctx)
+		count_data->count++;
+
+	return true;
+}
+
+static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct count_inflight_data count_data = {
+		.count	= 0,
+		.hctx	= hctx,
+	};
+
+	blk_mq_all_tag_busy_iter(hctx->tags, blk_mq_count_inflight_rq,
+			&count_data);
+
+	return count_data.count;
+}
+
+static void blk_mq_hctx_drain_inflight_rqs(struct blk_mq_hw_ctx *hctx)
+{
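+	/* Poll until no request on this hctx still holds a driver tag */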
+	while (blk_mq_tags_inflight_rqs(hctx))
+		msleep(5);
+}
+
 static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
 {
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
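+	/* True only if @cpu is the last online CPU in hctx->cpumask */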
+	if ((cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) == cpu) &&
+			(cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask)
+			 >= nr_cpu_ids)) {
+		/*
+		 * The current CPU is the last online one in this hctx.
+		 * S_INACTIVE can be observed in the dispatch path without
+		 * any barrier, because both run on the same CPU.
+		 */
+		set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+		blk_mq_hctx_drain_inflight_rqs(hctx);
+	}
 	return 0;
 }
 
@@ -2277,6 +2342,8 @@  static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	}
 	spin_unlock(&ctx->lock);
 
+	/* Undo BLK_MQ_S_INACTIVE set in blk_mq_hctx_notify_online() */
+	clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+
 	if (list_empty(&tmp))
 		return 0;