--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -357,6 +357,11 @@ static unsigned short blk_queue_split_all(struct request_queue *q,
if (!first)
first = split;
+ /*
+ * Except for the first split bio, the others always preempt
+ * a tag so that they can be dispatched sequentially.
+ */
+ split->bi_opf |= REQ_PREEMPTIVE;
nr_split++;
submit_bio_noacct(split);
}
@@ -387,7 +392,7 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio)
if (split) {
split->bi_nr_split = blk_queue_split_all(q, *bio);
- (*bio)->bi_opf |= REQ_SPLIT;
+ (*bio)->bi_opf |= (REQ_SPLIT | REQ_PREEMPTIVE);
submit_bio_noacct(*bio);
*bio = split;
}
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -127,6 +127,13 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
return ret;
}
+static inline bool preempt_tag(struct blk_mq_alloc_data *data,
+ struct sbitmap_queue *bt)
+{
+ return data->preemption ||
+ atomic_read(&bt->ws_active) <= SBQ_WAIT_QUEUES;
+}
+
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -148,12 +155,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
tag_offset = tags->nr_reserved_tags;
}
- tag = __blk_mq_get_tag(data, bt);
- if (tag != BLK_MQ_NO_TAG)
- goto found_tag;
+ if (data->flags & BLK_MQ_REQ_NOWAIT || preempt_tag(data, bt)) {
+ tag = __blk_mq_get_tag(data, bt);
+ if (tag != BLK_MQ_NO_TAG)
+ goto found_tag;
- if (data->flags & BLK_MQ_REQ_NOWAIT)
- return BLK_MQ_NO_TAG;
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
+ return BLK_MQ_NO_TAG;
+ }
wait.nr_tags += data->nr_split;
ws = bt_wait_ptr(bt, data->hctx);
@@ -171,20 +180,26 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
* Retry tag allocation after running the hardware queue,
* as running the queue may also have found completions.
*/
- tag = __blk_mq_get_tag(data, bt);
- if (tag != BLK_MQ_NO_TAG)
- break;
+ if (preempt_tag(data, bt)) {
+ tag = __blk_mq_get_tag(data, bt);
+ if (tag != BLK_MQ_NO_TAG)
+ break;
+ }
sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
- tag = __blk_mq_get_tag(data, bt);
- if (tag != BLK_MQ_NO_TAG)
- break;
+ if (preempt_tag(data, bt)) {
+ tag = __blk_mq_get_tag(data, bt);
+ if (tag != BLK_MQ_NO_TAG)
+ break;
+ }
bt_prev = bt;
io_schedule();
sbitmap_finish_wait(bt, ws, &wait);
+ if (!blk_mq_is_tag_preemptive(data->hctx->flags))
+ data->preemption = true;
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -470,6 +470,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+ if (blk_mq_is_tag_preemptive(data->hctx->flags))
+ data->preemption = true;
+
if (!(data->rq_flags & RQF_ELV))
blk_mq_tag_busy(data->hctx);
@@ -577,6 +580,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
data.hctx = xa_load(&q->hctx_table, hctx_idx);
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
+ if (blk_mq_is_tag_preemptive(data.hctx->flags))
+ data.preemption = true;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
data.ctx = __blk_mq_get_ctx(q, cpu);
@@ -2738,6 +2743,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
.nr_tags = 1,
.cmd_flags = bio->bi_opf,
.nr_split = bio->bi_nr_split,
+ .preemption = (bio->bi_opf & REQ_PREEMPTIVE),
};
struct request *rq;
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -157,6 +157,7 @@ struct blk_mq_alloc_data {
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
unsigned int nr_split;
+ bool preemption;
struct request **cached_rq;
/* input & output parameter */
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -419,6 +419,7 @@ enum req_flag_bits {
__REQ_DRV,
__REQ_SWAP, /* swapping request. */
__REQ_SPLIT, /* io is splitted */
+ __REQ_PREEMPTIVE, /* io can preempt tag */
__REQ_NR_BITS, /* stops here */
};
@@ -444,6 +445,7 @@ enum req_flag_bits {
#define REQ_DRV (1ULL << __REQ_DRV)
#define REQ_SWAP (1ULL << __REQ_SWAP)
#define REQ_SPLIT (1ULL << __REQ_SPLIT)
+#define REQ_PREEMPTIVE (1ULL << __REQ_PREEMPTIVE)
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -597,7 +597,8 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
return NULL;
}
-static unsigned int get_wake_nr(struct sbq_wait_state *ws, unsigned int nr_tags)
+static unsigned int get_wake_nr(struct sbq_wait_state *ws,
+ unsigned int *nr_tags)
{
struct sbq_wait *wait;
struct wait_queue_entry *entry;
@@ -606,11 +607,13 @@ static unsigned int get_wake_nr(struct sbq_wait_state *ws, unsigned int nr_tags)
spin_lock_irq(&ws->wait.lock);
list_for_each_entry(entry, &ws->wait.head, entry) {
wait = container_of(entry, struct sbq_wait, wait);
- if (nr_tags <= wait->nr_tags)
+ if (*nr_tags <= wait->nr_tags) {
+ *nr_tags = 0;
break;
+ }
nr++;
- nr_tags -= wait->nr_tags;
+ *nr_tags -= wait->nr_tags;
}
spin_unlock_irq(&ws->wait.lock);
@@ -648,7 +651,10 @@ static bool __sbq_wake_up(struct sbitmap_queue *sbq)
ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
if (ret == wait_cnt) {
sbq_index_atomic_inc(&sbq->wake_index);
- wake_up_nr(&ws->wait, get_wake_nr(ws, wake_batch));
+ wake_up_nr(&ws->wait, get_wake_nr(ws, &wake_batch));
+ if (wake_batch)
+ sbitmap_queue_wake_all(sbq);
+
return false;
}
To improve the sequentiality of split IO, this patch disables tag preemption
for the first split bio and for non-split bios when the device is under high
IO pressure. Note that this solution relies on the sbitmap waitqueues being
balanced; otherwise it can happen that 'wake_batch' tags are freed while the
woken waiters do not obtain 'wake_batch' new tags, so IO concurrency drops.
The next patch works around this problem, although fixing the unfairness of
the waitqueues directly might be the better approach.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-merge.c         |  7 ++++++-
 block/blk-mq-tag.c        | 37 ++++++++++++++++++++++++++-----------
 block/blk-mq.c            |  6 ++++++
 block/blk-mq.h            |  1 +
 include/linux/blk_types.h |  2 ++
 lib/sbitmap.c             | 14 ++++++++++----
 6 files changed, 51 insertions(+), 16 deletions(-)
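
As an aside (not part of the patch): below is a minimal userspace C sketch of
the leftover accounting that the reworked get_wake_nr() performs. The waiter
values and the names struct waiter and wake_nr() are made up for illustration;
the sketch only demonstrates the case the commit message warns about, where
the freed 'wake_batch' cannot be fully consumed by the waiters on a single
waitqueue, leaving a nonzero remainder, which is when the patch additionally
calls sbitmap_queue_wake_all().

/*
 * Standalone sketch, not kernel code: mimics the leftover accounting of
 * the reworked get_wake_nr() on a made-up waiter list.  struct waiter and
 * wake_nr() are hypothetical stand-ins for sbq_wait and get_wake_nr().
 */
#include <stdio.h>

struct waiter {
        unsigned int nr_tags;   /* tags this waiter needs */
};

/*
 * Walk the waiter list: while the remaining batch is strictly larger than
 * what the current waiter needs, count that waiter and subtract its demand;
 * once a waiter needs the whole remainder (or more), the batch is treated
 * as fully consumed.  On return, *batch is nonzero only if the list ran out
 * of waiters before the batch ran out.
 */
static unsigned int wake_nr(const struct waiter *w, unsigned int n,
                            unsigned int *batch)
{
        unsigned int nr = 0;
        unsigned int i;

        for (i = 0; i < n; i++) {
                if (*batch <= w[i].nr_tags) {
                        *batch = 0;     /* this waiter consumes the rest */
                        break;
                }
                nr++;
                *batch -= w[i].nr_tags;
        }
        return nr;
}

int main(void)
{
        struct waiter ws[] = { {1}, {2}, {3} }; /* waiters need 6 tags total */
        unsigned int batch = 8;                 /* 'wake_batch' tags freed */
        unsigned int nr = wake_nr(ws, 3, &batch);

        /*
         * Prints "wake 3 waiters, leftover 2": this waitqueue cannot absorb
         * all freed tags, the situation in which the patch falls back to
         * sbitmap_queue_wake_all() so the surplus wakeups are not lost.
         */
        printf("wake %u waiters, leftover %u\n", nr, batch);
        return 0;
}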