[-next,RFC,5/6] blk-mq: don't preempt tag except for split bios

Message ID 20220329094048.2107094-6-yukuai3@huawei.com (mailing list archive)
State New, archived
Series: improve large random io for HDD

Commit Message

Yu Kuai March 29, 2022, 9:40 a.m. UTC
In order to keep split IO sequential, this patch disables tag
preemption for the first split bio and for non-split bios when the
device is under high IO pressure.

Note that this solution relies on the sbitmap waitqueues being
balanced; otherwise it may happen that 'wake_batch' tags are freed
while the woken waiters obtain fewer than 'wake_batch' new tags, so IO
concurrency drops. The next patch will avoid that problem; however,
fixing the unfairness of the waitqueues directly might be better.
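
To make the mechanism concrete, the decision boils down to the
preempt_tag() helper added in blk-mq-tag.c below. A condensed view
(the comment is this changelog's reading of the code, not part of the
patch itself):

    static inline bool preempt_tag(struct blk_mq_alloc_data *data,
                                   struct sbitmap_queue *bt)
    {
            /*
             * Grab a free tag directly only if preemption was already
             * granted (a REQ_PREEMPTIVE bio, a tag set that allows
             * preemption, or a task that has already slept once), or
             * if few waitqueues are active, i.e. the device is not
             * under high IO pressure.
             */
            return data->preemption ||
                   atomic_read(&bt->ws_active) <= SBQ_WAIT_QUEUES;
    }

Everything else has to sleep on the sbitmap waitqueue and rely on the
batched wakeup, which is what keeps the split pieces of one large IO
dispatched back to back.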

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-merge.c         |  7 ++++++-
 block/blk-mq-tag.c        | 37 ++++++++++++++++++++++++++-----------
 block/blk-mq.c            |  6 ++++++
 block/blk-mq.h            |  1 +
 include/linux/blk_types.h |  2 ++
 lib/sbitmap.c             | 14 ++++++++++----
 6 files changed, 51 insertions(+), 16 deletions(-)

Patch

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 340860746cac..fd4bbf773b45 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -357,6 +357,11 @@  static unsigned short blk_queue_split_all(struct request_queue *q,
 		if (!first)
 			first = split;
 
+		/*
+		 * Except the first split bio, others will always preempt
+		 * tag, so that they can be sequential.
+		 */
+		split->bi_opf |= REQ_PREEMPTIVE;
 		nr_split++;
 		submit_bio_noacct(split);
 	}
@@ -387,7 +392,7 @@  void __blk_queue_split(struct request_queue *q, struct bio **bio)
 
 	if (split) {
 		split->bi_nr_split = blk_queue_split_all(q, *bio);
-		(*bio)->bi_opf |= REQ_SPLIT;
+		(*bio)->bi_opf |= (REQ_SPLIT | REQ_PREEMPTIVE);
 		submit_bio_noacct(*bio);
 		*bio = split;
 	}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83dfbe2f1cfc..4e485bcc5820 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -127,6 +127,13 @@  unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
 	return ret;
 }
 
+static inline bool preempt_tag(struct blk_mq_alloc_data *data,
+			       struct sbitmap_queue *bt)
+{
+	return data->preemption ||
+	       atomic_read(&bt->ws_active) <= SBQ_WAIT_QUEUES;
+}
+
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -148,12 +155,14 @@  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		tag_offset = tags->nr_reserved_tags;
 	}
 
-	tag = __blk_mq_get_tag(data, bt);
-	if (tag != BLK_MQ_NO_TAG)
-		goto found_tag;
+	if (data->flags & BLK_MQ_REQ_NOWAIT || preempt_tag(data, bt)) {
+		tag = __blk_mq_get_tag(data, bt);
+		if (tag != BLK_MQ_NO_TAG)
+			goto found_tag;
 
-	if (data->flags & BLK_MQ_REQ_NOWAIT)
-		return BLK_MQ_NO_TAG;
+		if (data->flags & BLK_MQ_REQ_NOWAIT)
+			return BLK_MQ_NO_TAG;
+	}
 
 	wait.nr_tags += data->nr_split;
 	ws = bt_wait_ptr(bt, data->hctx);
@@ -171,20 +180,26 @@  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		 * Retry tag allocation after running the hardware queue,
 		 * as running the queue may also have found completions.
 		 */
-		tag = __blk_mq_get_tag(data, bt);
-		if (tag != BLK_MQ_NO_TAG)
-			break;
+		if (preempt_tag(data, bt)) {
+			tag = __blk_mq_get_tag(data, bt);
+			if (tag != BLK_MQ_NO_TAG)
+				break;
+		}
 
 		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __blk_mq_get_tag(data, bt);
-		if (tag != BLK_MQ_NO_TAG)
-			break;
+		if (preempt_tag(data, bt)) {
+			tag = __blk_mq_get_tag(data, bt);
+			if (tag != BLK_MQ_NO_TAG)
+				break;
+		}
 
 		bt_prev = bt;
 		io_schedule();
 
 		sbitmap_finish_wait(bt, ws, &wait);
+		if (!blk_mq_is_tag_preemptive(data->hctx->flags))
+			data->preemption = true;
 
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9bace9e2c5ca..06ba6fa9ec1a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -470,6 +470,9 @@  static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 retry:
 	data->ctx = blk_mq_get_ctx(q);
 	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+	if (blk_mq_is_tag_preemptive(data->hctx->flags))
+		data->preemption = true;
+
 	if (!(data->rq_flags & RQF_ELV))
 		blk_mq_tag_busy(data->hctx);
 
@@ -577,6 +580,8 @@  struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	data.hctx = xa_load(&q->hctx_table, hctx_idx);
 	if (!blk_mq_hw_queue_mapped(data.hctx))
 		goto out_queue_exit;
+	if (blk_mq_is_tag_preemptive(data.hctx->flags))
+		data.preemption = true;
 	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
 	data.ctx = __blk_mq_get_ctx(q, cpu);
 
@@ -2738,6 +2743,7 @@  static struct request *blk_mq_get_new_requests(struct request_queue *q,
 		.nr_tags	= 1,
 		.cmd_flags	= bio->bi_opf,
 		.nr_split	= bio->bi_nr_split,
+		.preemption	= (bio->bi_opf & REQ_PREEMPTIVE),
 	};
 	struct request *rq;
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3eabe394a5a9..915bb710dd6f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -157,6 +157,7 @@  struct blk_mq_alloc_data {
 	/* allocate multiple requests/tags in one go */
 	unsigned int nr_tags;
 	unsigned int nr_split;
+	bool preemption;
 	struct request **cached_rq;
 
 	/* input & output parameter */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 702f6b83dc88..8fd9756f0a06 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -419,6 +419,7 @@  enum req_flag_bits {
 	__REQ_DRV,
 	__REQ_SWAP,		/* swapping request. */
 	__REQ_SPLIT,		/* io is splitted */
+	__REQ_PREEMPTIVE,	/* io can preempt tag */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -444,6 +445,7 @@  enum req_flag_bits {
 #define REQ_DRV			(1ULL << __REQ_DRV)
 #define REQ_SWAP		(1ULL << __REQ_SWAP)
 #define REQ_SPLIT		(1ULL << __REQ_SPLIT)
+#define REQ_PREEMPTIVE		(1ULL << __REQ_PREEMPTIVE)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 9d04c0ecc8f7..1655c15ee11d 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -597,7 +597,8 @@  static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 	return NULL;
 }
 
-static unsigned int get_wake_nr(struct sbq_wait_state *ws, unsigned int nr_tags)
+static unsigned int get_wake_nr(struct sbq_wait_state *ws,
+				unsigned int *nr_tags)
 {
 	struct sbq_wait *wait;
 	struct wait_queue_entry *entry;
@@ -606,11 +607,13 @@  static unsigned int get_wake_nr(struct sbq_wait_state *ws, unsigned int nr_tags)
 	spin_lock_irq(&ws->wait.lock);
 	list_for_each_entry(entry, &ws->wait.head, entry) {
 		wait = container_of(entry, struct sbq_wait, wait);
-		if (nr_tags <= wait->nr_tags)
+		if (*nr_tags <= wait->nr_tags) {
+			*nr_tags = 0;
 			break;
+		}
 
 		nr++;
-		nr_tags -= wait->nr_tags;
+		*nr_tags -= wait->nr_tags;
 	}
 	spin_unlock_irq(&ws->wait.lock);
 
@@ -648,7 +651,10 @@  static bool __sbq_wake_up(struct sbitmap_queue *sbq)
 		ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
 		if (ret == wait_cnt) {
 			sbq_index_atomic_inc(&sbq->wake_index);
-			wake_up_nr(&ws->wait, get_wake_nr(ws, wake_batch));
+			wake_up_nr(&ws->wait, get_wake_nr(ws, &wake_batch));
+			if (wake_batch)
+				sbitmap_queue_wake_all(sbq);
+
 			return false;
 		}