diff mbox series

[V10,10/11] blk-mq: re-submit IO in case that hctx is inactive

Message ID 20200505020930.1146281-11-ming.lei@redhat.com (mailing list archive)
State New, archived
Headers show
Series blk-mq: improvement CPU hotplug | expand

Commit Message

Ming Lei May 5, 2020, 2:09 a.m. UTC
When all CPUs in one hctx are offline and this hctx becomes inactive, we
shouldn't run this hw queue for completing request any more.

So allocate request from one live hctx, and clone & resubmit the request,
either it is from sw queue or scheduler queue.

Cc: John Garry <john.garry@huawei.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Tested-by: John Garry <john.garry@huawei.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 104 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 73e1a1d4c1c5..d639bcd89811 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2382,6 +2382,102 @@  static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
+static void blk_mq_resubmit_end_rq(struct request *rq, blk_status_t error)
+{
+	struct request *orig_rq = rq->end_io_data;
+
+	blk_mq_cleanup_rq(orig_rq);
+	blk_mq_end_request(orig_rq, error);
+
+	blk_put_request(rq);
+}
+
+static void blk_mq_resubmit_rq(struct request *rq)
+{
+	struct request *nrq;
+	unsigned int flags = 0;
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+	struct blk_mq_tags *tags = rq->q->elevator ? hctx->sched_tags :
+		hctx->tags;
+	bool reserved = blk_mq_tag_is_reserved(tags, rq->internal_tag);
+
+	if (rq->rq_flags & RQF_PREEMPT)
+		flags |= BLK_MQ_REQ_PREEMPT;
+	if (reserved)
+		flags |= BLK_MQ_REQ_RESERVED;
+
+	/* avoid allocation failure by clearing NOWAIT */
+	nrq = blk_get_request(rq->q, rq->cmd_flags & ~REQ_NOWAIT, flags);
+	if (!nrq)
+		return;
+
+	blk_rq_copy_request(nrq, rq);
+
+	nrq->timeout = rq->timeout;
+	nrq->rq_disk = rq->rq_disk;
+	nrq->part = rq->part;
+
+	memcpy(blk_mq_rq_to_pdu(nrq), blk_mq_rq_to_pdu(rq),
+			rq->q->tag_set->cmd_size);
+
+	nrq->end_io = blk_mq_resubmit_end_rq;
+	nrq->end_io_data = rq;
+	nrq->bio = rq->bio;
+	nrq->biotail = rq->biotail;
+
+	/* bios ownership has been transfered to new request */
+	rq->bio = rq->biotail = NULL;
+	rq->__data_len = 0;
+
+	if (blk_insert_cloned_request(nrq->q, nrq) != BLK_STS_OK)
+		blk_mq_request_bypass_insert(nrq, false, true);
+}
+
+static void blk_mq_hctx_deactivate(struct blk_mq_hw_ctx *hctx)
+{
+	LIST_HEAD(sched);
+	LIST_HEAD(re_submit);
+	LIST_HEAD(flush_in);
+	LIST_HEAD(flush_out);
+	struct request *rq, *nxt;
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (!e) {
+		blk_mq_flush_busy_ctxs(hctx, &re_submit);
+	} else {
+		while ((rq = e->type->ops.dispatch_request(hctx))) {
+			if (rq->mq_hctx != hctx)
+				list_add(&rq->queuelist, &sched);
+			else
+				list_add(&rq->queuelist, &re_submit);
+		}
+	}
+	while (!list_empty(&sched)) {
+		rq = list_first_entry(&sched, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		blk_mq_sched_insert_request(rq, true, true, true);
+	}
+
+	/* requests in dispatch list have to be re-submitted too */
+	spin_lock(&hctx->lock);
+	list_splice_tail_init(&hctx->dispatch, &re_submit);
+	spin_unlock(&hctx->lock);
+
+	/* blk_end_flush_machinery will cover flush request */
+	list_for_each_entry_safe(rq, nxt, &re_submit, queuelist) {
+		if (rq->rq_flags & RQF_FLUSH_SEQ)
+			list_move(&rq->queuelist, &flush_in);
+	}
+	blk_end_flush_machinery(hctx, &flush_in, &flush_out);
+	list_splice_tail(&flush_out, &re_submit);
+
+	while (!list_empty(&re_submit)) {
+		rq = list_first_entry(&re_submit, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		blk_mq_resubmit_rq(rq);
+	}
+}
+
 static void blk_mq_hctx_handle_dead_cpu(struct blk_mq_hw_ctx *hctx,
 		unsigned int cpu)
 {
@@ -2410,17 +2506,20 @@  static void blk_mq_hctx_handle_dead_cpu(struct blk_mq_hw_ctx *hctx,
 }
 
 /*
- * 'cpu' is going away. splice any existing rq_list entries from this
- * software queue to the hw queue dispatch list, and ensure that it
- * gets run.
+ * @cpu has gone away. If this hctx is inactive, we can't dispatch request
+ * to the hctx any more, so clone and re-submit requests from this hctx
  */
 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
 	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
 			struct blk_mq_hw_ctx, cpuhp_dead);
 
-	if (cpumask_test_cpu(cpu, hctx->cpumask))
-		blk_mq_hctx_handle_dead_cpu(hctx, cpu);
+	if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+		if (test_bit(BLK_MQ_S_INACTIVE, &hctx->state))
+			blk_mq_hctx_deactivate(hctx);
+		else
+			blk_mq_hctx_handle_dead_cpu(hctx, cpu);
+	}
 	return 0;
 }