[V11,11/12] blk-mq: re-submit IO in case that hctx is inactive
diff mbox series

Message ID 20200513034803.1844579-12-ming.lei@redhat.com
State New
Headers show
Series
  • blk-mq: improvement CPU hotplug
Related show

Commit Message

Ming Lei May 13, 2020, 3:48 a.m. UTC
When all CPUs in one hctx are offline and this hctx becomes inactive, we
shouldn't run this hw queue for completing requests any more.

So allocate a request from one live hctx, and clone & resubmit the request,
whether it comes from the sw queue or the scheduler queue.

Cc: John Garry <john.garry@huawei.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 111 insertions(+), 5 deletions(-)

Comments

John Garry May 13, 2020, 9:21 a.m. UTC | #1
On 13/05/2020 04:48, Ming Lei wrote:
> +static void blk_mq_resubmit_rq(struct request *rq)
> +{
> +	struct request *nrq;
> +	unsigned int flags = 0;
> +	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> +	struct blk_mq_tags *tags = rq->q->elevator ? hctx->sched_tags :
> +		hctx->tags;
> +	bool reserved = blk_mq_tag_is_reserved(tags, rq->internal_tag);
> +
> +	if (rq->rq_flags & RQF_PREEMPT)
> +		flags |= BLK_MQ_REQ_PREEMPT;
> +	if (reserved)
> +		flags |= BLK_MQ_REQ_RESERVED;
> +	/*
> +	 * Queue freezing might be in-progress, and wait freeze can't be
> +	 * done now because we have request not completed yet, so mark this
> +	 * allocation as BLK_MQ_REQ_FORCE for avoiding this allocation &
> +	 * freeze hung forever.
> +	 */
> +	flags |= BLK_MQ_REQ_FORCE;
> +

So setting this flag triggers this WARN:

[  101.308666] Modules linked in:
[  101.311710] CPU: 23 PID: 1491 Comm: bash Not tainted 
5.7.0-rc2-00106-g63430d85fea8 #337
[  101.319698] Hardware name: Huawei Taishan 2280 /D05, BIOS Hisilicon 
D05 IT21 Nemo 2.0 RC0 04/18/2018
[  101.328816] pstate: 60000005 (nZCv daif -PAN -UAO)
[  101.333593] pc : blk_get_request+0xa4/0xac
[  101.337676] lr : blk_get_request+0xa4/0xac
[  101.341758] sp : ffff800021773aa0
[  101.345059] x29: ffff800021773aa0 x28: 0000000000000004
[  101.350357] x27: 0000000000000004 x26: 0000000000000004
[  101.355655] x25: 0000000000000000 x24: ffff800010414b20
[  101.360953] x23: 0000000000000008 x22: ffff001fb0ecf900
[  101.366251] x21: ffff001fb0a42f40 x20: 0000000000004000
[  101.371549] x19: 0000000000000010 x18: 0000000000000000
[  101.376846] x17: 0000000000000000 x16: 0000000000000000
[  101.382144] x15: 0000000000000000 x14: 0000000000000000
[  101.387441] x13: 0000000000000000 x12: 0000000000000000
[  101.392739] x11: 000000000000064f x10: 0000000000000008
[  101.398036] x9 : ffff8000118f1c28 x8 : 2074736575716572
[  101.403334] x7 : 5f7465675f6b6c62 x6 : ffff041febee21d0
[  101.408632] x5 : 0000000000000000 x4 : 0000000000000000
[  101.413930] x3 : 0000000000000000 x2 : ffff041febee9080
[  101.419229] x1 : 0000000100000000 x0 : 000000000000001a
[  101.424527] Call trace:
[  101.426961]  blk_get_request+0xa4/0xac
[  101.430698]  blk_mq_hctx_deactivate+0x270/0x3e4
[  101.435215]  blk_mq_hctx_notify_dead+0x198/0x1b4
[  101.439821]  cpuhp_invoke_callback+0x170/0x1e0
[  101.444253]  _cpu_down+0x100/0x238
[  101.447642]  cpu_down+0x40/0x68
[  101.450770]  cpu_device_down+0x14/0x1c
[  101.454508]  cpu_subsys_offline+0xc/0x14
[  101.458417]  device_offline+0x98/0xc4
[  101.462065]  online_store+0x3c/0x88
[  101.465541]  dev_attr_store+0x14/0x24
[  101.469192]  sysfs_kf_write+0x44/0x4c
[  101.472840]  kernfs_fop_write+0xfc/0x208
[  101.476751]  __vfs_write+0x18/0x3c
[  101.480140]  vfs_write+0xb4/0x1b8
[  101.483442]  ksys_write+0x4c/0xac
[  101.486743]  __arm64_sys_write+0x1c/0x24
[  101.490654]  el0_svc_common.constprop.3+0xb8/0x170
[  101.495431]  do_el0_svc+0x70/0x88
[  101.498734]  el0_sync_handler+0xf0/0x12c
[  101.502643]  el0_sync+0x140/0x180
[  101.505944] ---[ end trace 137fed615521bd97 ]--

(the series tests ok, apart from that)

Thanks,
John

> +	/* avoid allocation failure by clearing NOWAIT */
> +	nrq = blk_get_request(rq->q, rq->cmd_flags & ~REQ_NOWAIT, flags);
> +	if (!nrq)
> +		return;
Christoph Hellwig May 13, 2020, 12:21 p.m. UTC | #2
Use of the BLK_MQ_REQ_FORCE is pretty bogus here..

> +	if (rq->rq_flags & RQF_PREEMPT)
> +		flags |= BLK_MQ_REQ_PREEMPT;
> +	if (reserved)
> +		flags |= BLK_MQ_REQ_RESERVED;
> +	/*
> +	 * Queue freezing might be in-progress, and wait freeze can't be
> +	 * done now because we have request not completed yet, so mark this
> +	 * allocation as BLK_MQ_REQ_FORCE for avoiding this allocation &
> +	 * freeze hung forever.
> +	 */
> +	flags |= BLK_MQ_REQ_FORCE;
> +
> +	/* avoid allocation failure by clearing NOWAIT */
> +	nrq = blk_get_request(rq->q, rq->cmd_flags & ~REQ_NOWAIT, flags);
> +	if (!nrq)
> +		return;

blk_get_request returns an ERR_PTR.

But I'd rather avoid the magic new BLK_MQ_REQ_FORCE hack when we can
just open code it and document what is going on:

static struct blk_mq_tags *blk_mq_rq_tags(struct request *rq)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	if (rq->q->elevator)
		return hctx->sched_tags;
	return hctx->tags;
}

static void blk_mq_resubmit_rq(struct request *rq)
{
	struct blk_mq_alloc_data alloc_data = {
		.cmd_flags	= rq->cmd_flags & ~REQ_NOWAIT;
	};
	struct request *nrq;

	if (rq->rq_flags & RQF_PREEMPT)
		alloc_data.flags |= BLK_MQ_REQ_PREEMPT;
	if (blk_mq_tag_is_reserved(blk_mq_rq_tags(rq), rq->internal_tag))
		alloc_data.flags |= BLK_MQ_REQ_RESERVED;

	/*
	 * We must still be able to finish a resubmission due to a hotplug
	 * even if a queue freeze is in progress.
	 */
	percpu_ref_get(&q->q_usage_counter);
	nrq = blk_mq_get_request(rq->q, NULL, &alloc_data);
	blk_queue_exit(q);

	if (!nrq)
		return; // XXX: warn?
	if (nrq->q->mq_ops->initialize_rq_fn)
		rq->mq_ops->initialize_rq_fn(nrq);

	blk_rq_copy_request(nrq, rq);
	...
Bart Van Assche May 13, 2020, 3:03 p.m. UTC | #3
On 2020-05-13 05:21, Christoph Hellwig wrote:
> Use of the BLK_MQ_REQ_FORCE is pretty bogus here..
> 
>> +	if (rq->rq_flags & RQF_PREEMPT)
>> +		flags |= BLK_MQ_REQ_PREEMPT;
>> +	if (reserved)
>> +		flags |= BLK_MQ_REQ_RESERVED;
>> +	/*
>> +	 * Queue freezing might be in-progress, and wait freeze can't be
>> +	 * done now because we have request not completed yet, so mark this
>> +	 * allocation as BLK_MQ_REQ_FORCE for avoiding this allocation &
>> +	 * freeze hung forever.
>> +	 */
>> +	flags |= BLK_MQ_REQ_FORCE;
>> +
>> +	/* avoid allocation failure by clearing NOWAIT */
>> +	nrq = blk_get_request(rq->q, rq->cmd_flags & ~REQ_NOWAIT, flags);
>> +	if (!nrq)
>> +		return;
> 
> blk_get_request returns an ERR_PTR.
> 
> But I'd rather avoid the magic new BLK_MQ_REQ_FORCE hack when we can
> just open code it and document what is going on:
> 
> static struct blk_mq_tags *blk_mq_rq_tags(struct request *rq)
> {
> 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> 
> 	if (rq->q->elevator)
> 		return hctx->sched_tags;
> 	return hctx->tags;
> }
> 
> static void blk_mq_resubmit_rq(struct request *rq)
> {
> 	struct blk_mq_alloc_data alloc_data = {
> 		.cmd_flags	= rq->cmd_flags & ~REQ_NOWAIT;
> 	};
> 	struct request *nrq;
> 
> 	if (rq->rq_flags & RQF_PREEMPT)
> 		alloc_data.flags |= BLK_MQ_REQ_PREEMPT;
> 	if (blk_mq_tag_is_reserved(blk_mq_rq_tags(rq), rq->internal_tag))
> 		alloc_data.flags |= BLK_MQ_REQ_RESERVED;
> 
> 	/*
> 	 * We must still be able to finish a resubmission due to a hotplug
> 	 * even if a queue freeze is in progress.
> 	 */
> 	percpu_ref_get(&q->q_usage_counter);
> 	nrq = blk_mq_get_request(rq->q, NULL, &alloc_data);
> 	blk_queue_exit(q);
> 
> 	if (!nrq)
> 		return; // XXX: warn?
> 	if (nrq->q->mq_ops->initialize_rq_fn)
> 		rq->mq_ops->initialize_rq_fn(nrq);
> 
> 	blk_rq_copy_request(nrq, rq);
> 	...

I don't like this because the above code allows allocation of requests
and tags while a request queue is frozen. I'm concerned that this will
break code that assumes that no tags are allocated while a request queue
is frozen. If a request queue has a single hardware queue with 64 tags,
if the above code allocates tag 40 and if blk_mq_tag_update_depth()
reduces the queue depth to 32, will nrq become a dangling pointer?

Thanks,

Bart.
Ming Lei May 14, 2020, 12:40 a.m. UTC | #4
On Wed, May 13, 2020 at 02:21:47PM +0200, Christoph Hellwig wrote:
> Use of the BLK_MQ_REQ_FORCE is pretty bogus here..
> 
> > +	if (rq->rq_flags & RQF_PREEMPT)
> > +		flags |= BLK_MQ_REQ_PREEMPT;
> > +	if (reserved)
> > +		flags |= BLK_MQ_REQ_RESERVED;
> > +	/*
> > +	 * Queue freezing might be in-progress, and wait freeze can't be
> > +	 * done now because we have request not completed yet, so mark this
> > +	 * allocation as BLK_MQ_REQ_FORCE for avoiding this allocation &
> > +	 * freeze hung forever.
> > +	 */
> > +	flags |= BLK_MQ_REQ_FORCE;
> > +
> > +	/* avoid allocation failure by clearing NOWAIT */
> > +	nrq = blk_get_request(rq->q, rq->cmd_flags & ~REQ_NOWAIT, flags);
> > +	if (!nrq)
> > +		return;
> 
> blk_get_request returns an ERR_PTR.
> 
> But I'd rather avoid the magic new BLK_MQ_REQ_FORCE hack when we can
> just open code it and document what is going on:

BLK_MQ_REQ_FORCE is actually not a hack, there are other use cases
which need that too, see commit log of patch 10/12.

> 
> static struct blk_mq_tags *blk_mq_rq_tags(struct request *rq)
> {
> 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> 
> 	if (rq->q->elevator)
> 		return hctx->sched_tags;
> 	return hctx->tags;
> }
> 
> static void blk_mq_resubmit_rq(struct request *rq)
> {
> 	struct blk_mq_alloc_data alloc_data = {
> 		.cmd_flags	= rq->cmd_flags & ~REQ_NOWAIT;
> 	};
> 	struct request *nrq;
> 
> 	if (rq->rq_flags & RQF_PREEMPT)
> 		alloc_data.flags |= BLK_MQ_REQ_PREEMPT;
> 	if (blk_mq_tag_is_reserved(blk_mq_rq_tags(rq), rq->internal_tag))
> 		alloc_data.flags |= BLK_MQ_REQ_RESERVED;
> 
> 	/*
> 	 * We must still be able to finish a resubmission due to a hotplug
> > 	 * even if a queue freeze is in progress.
> 	 */
> 	percpu_ref_get(&q->q_usage_counter);
> 	nrq = blk_mq_get_request(rq->q, NULL, &alloc_data);
> 	blk_queue_exit(q);

This way works too.

> 
> 	if (!nrq)
> 		return; // XXX: warn?

It isn't possible because we clear the NOWAIT flag.


Thanks, 
Ming
Ming Lei May 14, 2020, 12:45 a.m. UTC | #5
On Wed, May 13, 2020 at 08:03:13AM -0700, Bart Van Assche wrote:
> On 2020-05-13 05:21, Christoph Hellwig wrote:
> > Use of the BLK_MQ_REQ_FORCE is pretty bogus here..
> > 
> >> +	if (rq->rq_flags & RQF_PREEMPT)
> >> +		flags |= BLK_MQ_REQ_PREEMPT;
> >> +	if (reserved)
> >> +		flags |= BLK_MQ_REQ_RESERVED;
> >> +	/*
> >> +	 * Queue freezing might be in-progress, and wait freeze can't be
> >> +	 * done now because we have request not completed yet, so mark this
> >> +	 * allocation as BLK_MQ_REQ_FORCE for avoiding this allocation &
> >> +	 * freeze hung forever.
> >> +	 */
> >> +	flags |= BLK_MQ_REQ_FORCE;
> >> +
> >> +	/* avoid allocation failure by clearing NOWAIT */
> >> +	nrq = blk_get_request(rq->q, rq->cmd_flags & ~REQ_NOWAIT, flags);
> >> +	if (!nrq)
> >> +		return;
> > 
> > blk_get_request returns an ERR_PTR.
> > 
> > But I'd rather avoid the magic new BLK_MQ_REQ_FORCE hack when we can
> > just open code it and document what is going on:
> > 
> > static struct blk_mq_tags *blk_mq_rq_tags(struct request *rq)
> > {
> > 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> > 
> > 	if (rq->q->elevator)
> > 		return hctx->sched_tags;
> > 	return hctx->tags;
> > }
> > 
> > static void blk_mq_resubmit_rq(struct request *rq)
> > {
> > 	struct blk_mq_alloc_data alloc_data = {
> > 		.cmd_flags	= rq->cmd_flags & ~REQ_NOWAIT;
> > 	};
> > 	struct request *nrq;
> > 
> > 	if (rq->rq_flags & RQF_PREEMPT)
> > 		alloc_data.flags |= BLK_MQ_REQ_PREEMPT;
> > 	if (blk_mq_tag_is_reserved(blk_mq_rq_tags(rq), rq->internal_tag))
> > 		alloc_data.flags |= BLK_MQ_REQ_RESERVED;
> > 
> > 	/*
> > 	 * We must still be able to finish a resubmission due to a hotplug
> > > 	 * even if a queue freeze is in progress.
> > 	 */
> > 	percpu_ref_get(&q->q_usage_counter);
> > 	nrq = blk_mq_get_request(rq->q, NULL, &alloc_data);
> > 	blk_queue_exit(q);
> > 
> > 	if (!nrq)
> > 		return; // XXX: warn?
> > 	if (nrq->q->mq_ops->initialize_rq_fn)
> > 		rq->mq_ops->initialize_rq_fn(nrq);
> > 
> > 	blk_rq_copy_request(nrq, rq);
> > 	...
> 
> I don't like this because the above code allows allocation of requests
> and tags while a request queue is frozen. I'm concerned that this will
> break code that assumes that no tags are allocated while a request queue
> is frozen. If a request queue has a single hardware queue with 64 tags,

The above code path will never be called for a single hw queue.

> if the above code allocates tag 40 and if blk_mq_tag_update_depth()
> reduces the queue depth to 32, will nrq become a dangling pointer?

allocation for nrq is just like any other normal allocation, and if
it doesn't work with blk_mq_tag_update_depth(), it must be a more
generic issue rather than one related to this specific use case.

The only difference is that 'nrq' will be allocated from a new active
hctx, so the two requests can co-exist and we needn't worry about deadlock.


thanks,
Ming

Patch
diff mbox series

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7c640482fb24..c9a3e48a1ebc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2402,6 +2402,109 @@  static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
+static void blk_mq_resubmit_end_rq(struct request *rq, blk_status_t error)
+{
+	struct request *orig_rq = rq->end_io_data;
+
+	blk_mq_cleanup_rq(orig_rq);
+	blk_mq_end_request(orig_rq, error);
+
+	blk_put_request(rq);
+}
+
+static void blk_mq_resubmit_rq(struct request *rq)
+{
+	struct request *nrq;
+	unsigned int flags = 0;
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+	struct blk_mq_tags *tags = rq->q->elevator ? hctx->sched_tags :
+		hctx->tags;
+	bool reserved = blk_mq_tag_is_reserved(tags, rq->internal_tag);
+
+	if (rq->rq_flags & RQF_PREEMPT)
+		flags |= BLK_MQ_REQ_PREEMPT;
+	if (reserved)
+		flags |= BLK_MQ_REQ_RESERVED;
+	/*
+	 * Queue freezing might be in-progress, and wait freeze can't be
+	 * done now because we have request not completed yet, so mark this
+	 * allocation as BLK_MQ_REQ_FORCE for avoiding this allocation &
+	 * freeze hung forever.
+	 */
+	flags |= BLK_MQ_REQ_FORCE;
+
+	/* avoid allocation failure by clearing NOWAIT */
+	nrq = blk_get_request(rq->q, rq->cmd_flags & ~REQ_NOWAIT, flags);
+	if (!nrq)
+		return;
+
+	blk_rq_copy_request(nrq, rq);
+
+	nrq->timeout = rq->timeout;
+	nrq->rq_disk = rq->rq_disk;
+	nrq->part = rq->part;
+
+	memcpy(blk_mq_rq_to_pdu(nrq), blk_mq_rq_to_pdu(rq),
+			rq->q->tag_set->cmd_size);
+
+	nrq->end_io = blk_mq_resubmit_end_rq;
+	nrq->end_io_data = rq;
+	nrq->bio = rq->bio;
+	nrq->biotail = rq->biotail;
+
+	/* bios ownership has been transfered to new request */
+	rq->bio = rq->biotail = NULL;
+	rq->__data_len = 0;
+
+	if (blk_insert_cloned_request(nrq->q, nrq) != BLK_STS_OK)
+		blk_mq_request_bypass_insert(nrq, false, true);
+}
+
+static void blk_mq_hctx_deactivate(struct blk_mq_hw_ctx *hctx)
+{
+	LIST_HEAD(sched);
+	LIST_HEAD(re_submit);
+	LIST_HEAD(flush_in);
+	LIST_HEAD(flush_out);
+	struct request *rq, *nxt;
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (!e) {
+		blk_mq_flush_busy_ctxs(hctx, &re_submit);
+	} else {
+		while ((rq = e->type->ops.dispatch_request(hctx))) {
+			if (rq->mq_hctx != hctx)
+				list_add(&rq->queuelist, &sched);
+			else
+				list_add(&rq->queuelist, &re_submit);
+		}
+	}
+	while (!list_empty(&sched)) {
+		rq = list_first_entry(&sched, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		blk_mq_sched_insert_request(rq, true, true, true);
+	}
+
+	/* requests in dispatch list have to be re-submitted too */
+	spin_lock(&hctx->lock);
+	list_splice_tail_init(&hctx->dispatch, &re_submit);
+	spin_unlock(&hctx->lock);
+
+	/* blk_end_flush_machinery will cover flush request */
+	list_for_each_entry_safe(rq, nxt, &re_submit, queuelist) {
+		if (rq->rq_flags & RQF_FLUSH_SEQ)
+			list_move(&rq->queuelist, &flush_in);
+	}
+	blk_end_flush_machinery(hctx, &flush_in, &flush_out);
+	list_splice_tail(&flush_out, &re_submit);
+
+	while (!list_empty(&re_submit)) {
+		rq = list_first_entry(&re_submit, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		blk_mq_resubmit_rq(rq);
+	}
+}
+
 static void blk_mq_hctx_handle_dead_cpu(struct blk_mq_hw_ctx *hctx,
 		unsigned int cpu)
 {
@@ -2430,17 +2533,20 @@  static void blk_mq_hctx_handle_dead_cpu(struct blk_mq_hw_ctx *hctx,
 }
 
 /*
- * 'cpu' is going away. splice any existing rq_list entries from this
- * software queue to the hw queue dispatch list, and ensure that it
- * gets run.
+ * @cpu has gone away. If this hctx is inactive, we can't dispatch request
+ * to the hctx any more, so clone and re-submit requests from this hctx
  */
 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
 	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
 			struct blk_mq_hw_ctx, cpuhp_dead);
 
-	if (cpumask_test_cpu(cpu, hctx->cpumask))
-		blk_mq_hctx_handle_dead_cpu(hctx, cpu);
+	if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+		if (test_bit(BLK_MQ_S_INACTIVE, &hctx->state))
+			blk_mq_hctx_deactivate(hctx);
+		else
+			blk_mq_hctx_handle_dead_cpu(hctx, cpu);
+	}
 	return 0;
 }