blk-mq: don't fail driver tag allocation because of inactive hctx
diff mbox series

Message ID 20200603105128.2147139-1-ming.lei@redhat.com
State New
Headers show
Series
  • blk-mq: don't fail driver tag allocation because of inactive hctx
Related show

Commit Message

Ming Lei June 3, 2020, 10:51 a.m. UTC
Commit bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are offline")
prevents new requests from being allocated on a hctx which is going to be inactive,
meanwhile draining all in-queue requests.

We needn't prevent driver tags from being allocated during cpu hotplug; more
importantly we have to provide driver tags for requests, so that the cpu hotplug
handling can move on. blk_mq_get_tag() is shared for allocating both the internal
tag and the driver tag, so driver tag allocation may fail because the hctx is
marked as inactive.

Fix the issue by moving BLK_MQ_S_INACTIVE check to __blk_mq_alloc_request().

Cc: Dongli Zhang <dongli.zhang@oracle.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Daniel Wagner <dwagner@suse.de>
Fixes: bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are offline")
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-tag.c |  8 --------
 block/blk-mq.c     | 27 ++++++++++++++++++++-------
 2 files changed, 20 insertions(+), 15 deletions(-)

Comments

Christoph Hellwig June 3, 2020, 11:53 a.m. UTC | #1
On Wed, Jun 03, 2020 at 06:51:27PM +0800, Ming Lei wrote:
> Commit bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are offline")
> prevents new request from being allocated on hctx which is going to be inactive,
> meantime drains all in-queue requests.
> 
> We needn't to prevent driver tag from being allocated during cpu hotplug, more
> importantly we have to provide driver tag for requests, so that the cpu hotplug
> handling can move on. blk_mq_get_tag() is shared for allocating both internal
> tag and drive tag, so driver tag allocation may fail because the hctx is
> marked as inactive.
> 
> Fix the issue by moving BLK_MQ_S_INACTIVE check to __blk_mq_alloc_request().

This looks correct, but a little ugly.  How about we resurrect my
patch to split off blk_mq_get_driver_tag entirely?  Quick untested rebase
below, still needs a better changelog and fixes tag:

---
From e432011e2eb5ac7bd1046bbf936645e9f7b74e64 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 16 May 2020 08:03:48 +0200
Subject: blk-mq: split out a __blk_mq_get_driver_tag helper

Allocation of the driver tag in the case of using a scheduler shares very
little code with the "normal" tag allocation.  Split out a new helper to
streamline this path, and untangle it from the complex normal tag
allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-tag.c | 27 +++++++++++++++++++++++++++
 block/blk-mq-tag.h |  8 ++++++++
 block/blk-mq.c     | 29 -----------------------------
 block/blk-mq.h     |  1 -
 4 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 96a39d0724a29..cded7fdcad8ef 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -191,6 +191,33 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	return tag + tag_offset;
 }
 
+bool __blk_mq_get_driver_tag(struct request *rq)
+{
+	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+	bool shared = blk_mq_tag_busy(rq->mq_hctx);
+	int tag;
+
+	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+		bt = &rq->mq_hctx->tags->breserved_tags;
+		tag_offset = 0;
+	}
+
+	if (!hctx_may_queue(rq->mq_hctx, bt))
+		return false;
+	tag = __sbitmap_queue_get(bt);
+	if (tag == BLK_MQ_NO_TAG)
+		return false;
+
+	rq->tag = tag + tag_offset;
+	if (shared) {
+		rq->rq_flags |= RQF_MQ_INFLIGHT;
+		atomic_inc(&rq->mq_hctx->nr_active);
+	}
+	rq->mq_hctx->tags->rqs[rq->tag] = rq;
+	return true;
+}
+
 void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 		    unsigned int tag)
 {
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d38e48f2a0a4a..2e4ef51cdb32a 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -51,6 +51,14 @@ enum {
 	BLK_MQ_TAG_MAX		= BLK_MQ_NO_TAG - 1,
 };
 
+bool __blk_mq_get_driver_tag(struct request *rq);
+static inline bool blk_mq_get_driver_tag(struct request *rq)
+{
+	if (rq->tag != BLK_MQ_NO_TAG)
+		return true;
+	return __blk_mq_get_driver_tag(rq);
+}
+
 extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
 extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9a36ac1c1fa1d..4f57d27bfa737 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1052,35 +1052,6 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
-bool blk_mq_get_driver_tag(struct request *rq)
-{
-	struct blk_mq_alloc_data data = {
-		.q = rq->q,
-		.hctx = rq->mq_hctx,
-		.flags = BLK_MQ_REQ_NOWAIT,
-		.cmd_flags = rq->cmd_flags,
-	};
-	bool shared;
-
-	if (rq->tag != BLK_MQ_NO_TAG)
-		return true;
-
-	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
-		data.flags |= BLK_MQ_REQ_RESERVED;
-
-	shared = blk_mq_tag_busy(data.hctx);
-	rq->tag = blk_mq_get_tag(&data);
-	if (rq->tag >= 0) {
-		if (shared) {
-			rq->rq_flags |= RQF_MQ_INFLIGHT;
-			atomic_inc(&data.hctx->nr_active);
-		}
-		data.hctx->tags->rqs[rq->tag] = rq;
-	}
-
-	return rq->tag != BLK_MQ_NO_TAG;
-}
-
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 				int flags, void *key)
 {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index a139b06318174..b3ce0f3a2ad2a 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -44,7 +44,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 				bool kick_requeue_list);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
-bool blk_mq_get_driver_tag(struct request *rq);
 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
 					struct blk_mq_ctx *start);
Ming Lei June 3, 2020, 1:36 p.m. UTC | #2
On Wed, Jun 03, 2020 at 01:53:47PM +0200, Christoph Hellwig wrote:
> On Wed, Jun 03, 2020 at 06:51:27PM +0800, Ming Lei wrote:
> > Commit bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are offline")
> > prevents new request from being allocated on hctx which is going to be inactive,
> > meantime drains all in-queue requests.
> > 
> > We needn't to prevent driver tag from being allocated during cpu hotplug, more
> > importantly we have to provide driver tag for requests, so that the cpu hotplug
> > handling can move on. blk_mq_get_tag() is shared for allocating both internal
> > tag and drive tag, so driver tag allocation may fail because the hctx is
> > marked as inactive.
> > 
> > Fix the issue by moving BLK_MQ_S_INACTIVE check to __blk_mq_alloc_request().
> 
> This looks correct, but a little ugly.  How about we resurrect my
> patch to split off blk_mq_get_driver_tag entirely?  Quick untested rebase

OK, I am fine.

> below, still needs a better changelog and fixes tg:
> 
> ---
> From e432011e2eb5ac7bd1046bbf936645e9f7b74e64 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Sat, 16 May 2020 08:03:48 +0200
> Subject: blk-mq: split out a __blk_mq_get_driver_tag helper
> 
> Allocation of the driver tag in the case of using a scheduler shares very
> little code with the "normal" tag allocation.  Split out a new helper to
> streamline this path, and untangle it from the complex normal tag
> allocation.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/blk-mq-tag.c | 27 +++++++++++++++++++++++++++
>  block/blk-mq-tag.h |  8 ++++++++
>  block/blk-mq.c     | 29 -----------------------------
>  block/blk-mq.h     |  1 -
>  4 files changed, 35 insertions(+), 30 deletions(-)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 96a39d0724a29..cded7fdcad8ef 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -191,6 +191,33 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>  	return tag + tag_offset;
>  }
>  
> +bool __blk_mq_get_driver_tag(struct request *rq)
> +{
> +	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
> +	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
> +	bool shared = blk_mq_tag_busy(rq->mq_hctx);

Not necessary to add 'shared' which is just used once.

> +	int tag;
> +
> +	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
> +		bt = &rq->mq_hctx->tags->breserved_tags;

Too many rq->mq_hctx->tags, you can add one local variable to store it.

Otherwise, looks fine:

Reviewed-by: Ming Lei <ming.lei@redhat.com>

Thanks, 
Ming
John Garry June 3, 2020, 4:16 p.m. UTC | #3
On 03/06/2020 14:36, Ming Lei wrote:
> On Wed, Jun 03, 2020 at 01:53:47PM +0200, Christoph Hellwig wrote:
>> On Wed, Jun 03, 2020 at 06:51:27PM +0800, Ming Lei wrote:
>>> Commit bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are offline")
>>> prevents new request from being allocated on hctx which is going to be inactive,
>>> meantime drains all in-queue requests.
>>>
>>> We needn't to prevent driver tag from being allocated during cpu hotplug, more
>>> importantly we have to provide driver tag for requests, so that the cpu hotplug
>>> handling can move on. blk_mq_get_tag() is shared for allocating both internal
>>> tag and drive tag, so driver tag allocation may fail because the hctx is
>>> marked as inactive.
>>>
>>> Fix the issue by moving BLK_MQ_S_INACTIVE check to __blk_mq_alloc_request().
>>
>> This looks correct, but a little ugly.  How about we resurrect my
>> patch to split off blk_mq_get_driver_tag entirely?  Quick untested rebase
> 
> OK, I am fine.
> 
>> below, still needs a better changelog and fixes tg:
>>
>> ---
>>  From e432011e2eb5ac7bd1046bbf936645e9f7b74e64 Mon Sep 17 00:00:00 2001
>> From: Christoph Hellwig <hch@lst.de>
>> Date: Sat, 16 May 2020 08:03:48 +0200
>> Subject: blk-mq: split out a __blk_mq_get_driver_tag helper
>>
>> Allocation of the driver tag in the case of using a scheduler shares very
>> little code with the "normal" tag allocation.  Split out a new helper to
>> streamline this path, and untangle it from the complex normal tag
>> allocation.
>>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>> ---
>>   block/blk-mq-tag.c | 27 +++++++++++++++++++++++++++
>>   block/blk-mq-tag.h |  8 ++++++++
>>   block/blk-mq.c     | 29 -----------------------------
>>   block/blk-mq.h     |  1 -
>>   4 files changed, 35 insertions(+), 30 deletions(-)
>>
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index 96a39d0724a29..cded7fdcad8ef 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -191,6 +191,33 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>>   	return tag + tag_offset;
>>   }
>>   
>> +bool __blk_mq_get_driver_tag(struct request *rq)
>> +{
>> +	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
>> +	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
>> +	bool shared = blk_mq_tag_busy(rq->mq_hctx);
> 
> Not necessary to add 'shared' which is just used once.
> 
>> +	int tag;
>> +
>> +	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
>> +		bt = &rq->mq_hctx->tags->breserved_tags;
> 
> Too many rq->mq_hctx->tags, you can add one local variable to store it.
> 
> Otherwise, looks fine:
> 
> Reviewed-by: Ming Lei <ming.lei@redhat.com>

So I did see this for mq-deadline scheduler:

] Unable to handle kernel NULL pointer dereference at virtual address 
0000000000000040
[  518.688402] Mem abort info:
[  518.691183]ESR = 0x96000004
[  518.694233]EC = 0x25: DABT (current EL), IL = 32 bits
[  518.699568]SET = 0, FnV = 0
[  518.702667] irq_shutdown irq143
[  518.705937] CPU43: shutdown
[  518.706093]EA = 0, S1PTW = 0
[  518.708733] psci: CPU43 killed (polled 0 ms)
[  518.716141] Data abort info:
[  518.719050]ISV = 0, ISS = 0x00000004
[  518.722892]CM = 0, WnR = 0
[  518.725867] user pgtable: 4k pages, 48-bit VAs, pgdp=0000041fa4eae000
[  518.732318] [0000000000000040] pgd=0000000000000000
[  518.737208] Internal error: Oops: 96000004 [#1] PREEMPT SMP
[  518.742768] Modules linked in:
[  518.745812] CPU: 0 PID: 1309 Comm: kworker/43:1H Not tainted 
5.7.0-rc2-00125-gf498b5de8a1c #390
[  518.754496] Hardware name: Huawei Taishan 2280 /D05, BIOS Hisilicon 
D05 IT21 Nemo 2.0 RC0 04/18/2018
[  518.763624] Workqueue: kblockd blk_mq_run_work_fn
[  518.768315] pstate: 80000005 (Nzcv daif -PAN -UAO)
[  518.773094] pc : blk_mq_put_tag+0x2c/0x68
[  518.777091] lr : blk_mq_get_tag+0x2ec/0x338
[  518.781260] sp : ffff800021a53ab0
[  518.784562] x29: ffff800021a53ab0 x28: ffff041f9e81d248
[  518.789861] x27: 00000000000002aa x26: ffff041fa3e49300
[  518.795159] x25: ffff001fb0178fc0 x24: ffff800021a53c68
[  518.800458] x23: 0000000000000000 x22: 0000000000000000
[  518.805756] x21: ffff800011a69000 x20: ffff041fa3e49310
[  518.811055] x19: ffff800021a53b90 x18: 0000000000000004
[  518.816354] x17: 0000000000000000 x16: 0000000000000000
[  518.821652] x15: 0000000000000000 x14: 00000000000000ef
[  518.826951] x13: 00000000000000ef x12: 0000000000000001
[  518.832249] x11: 0000000000000000 x10: 00000000000009c0
[  518.837548] x9 : ffff800021a53d50 x8 : ffff041f9e39a620
[  518.842847] x7 : fefefefefefefeff x6 : 0000000000000000
[  518.848146] x5 : 400007ffffffffff x4 : 0000000000001000
[  518.853445] x3 : 00000000000002aa x2 : 0000000000000000
[  518.858743] x1 : 0000000000000000 x0 : ffff041fa3e49300
[  518.864042] Call trace:
[  518.866477]  blk_mq_put_tag+0x2c/0x68
[  518.870126]  blk_mq_get_tag+0x2ec/0x338
[  518.873948]  blk_mq_get_driver_tag+0xf8/0x168
[  518.878292]  blk_mq_dispatch_rq_list+0x118/0x670
[  518.882896]  blk_mq_do_dispatch_sched+0xb0/0x148
[  518.887500]  __blk_mq_sched_dispatch_requests+0xec/0x1d8
[  518.892799]  blk_mq_sched_dispatch_requests+0x3c/0x78
[  518.897837]  __blk_mq_run_hw_queue+0xb4/0x130
[  518.902181]  blk_mq_run_work_fn+0x20/0x30
[  518.906179]  process_one_work+0x1e8/0x360
[  518.910176]  worker_thread+0x44/0x488
[  518.913825]  kthread+0x150/0x158
[  518.917041]  ret_from_fork+0x10/0x1c
[  518.920606] Code: b9400004 4b020063 6b04007f 540001e2 (b9404022)

With Christoph's patch and "blk-mq: get ctx in order to handle 
BLK_MQ_S_INACTIVE in blk_mq_get_tag()" [I didn't test without that 
patch], no issue detected so far.

Thanks,
John
Christoph Hellwig June 3, 2020, 6:12 p.m. UTC | #4
On Wed, Jun 03, 2020 at 09:36:08PM +0800, Ming Lei wrote:
> > +bool __blk_mq_get_driver_tag(struct request *rq)
> > +{
> > +	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
> > +	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
> > +	bool shared = blk_mq_tag_busy(rq->mq_hctx);
> 
> Not necessary to add 'shared' which is just used once.

blk_mq_tag_busy also increments active_queues, and the existing code
does that before various early returns.  To keep the behavior as-is
the call to blk_mq_tag_busy can't be moved around.

> > +	int tag;
> > +
> > +	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
> > +		bt = &rq->mq_hctx->tags->breserved_tags;
> 
> Too many rq->mq_hctx->tags, you can add one local variable to store it.

Really just three of them.  And with a local variable confusing it with
rq->mq_hctx->sched_tags becomes much easier.
Dongli Zhang June 4, 2020, 5:22 a.m. UTC | #5
On 6/3/20 9:16 AM, John Garry wrote:
> On 03/06/2020 14:36, Ming Lei wrote:
>> On Wed, Jun 03, 2020 at 01:53:47PM +0200, Christoph Hellwig wrote:
>>> On Wed, Jun 03, 2020 at 06:51:27PM +0800, Ming Lei wrote:
>>>> Commit bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are offline")
>>>> prevents new request from being allocated on hctx which is going to be
>>>> inactive,
>>>> meantime drains all in-queue requests.
>>>>
>>>> We needn't to prevent driver tag from being allocated during cpu hotplug, more
>>>> importantly we have to provide driver tag for requests, so that the cpu hotplug
>>>> handling can move on. blk_mq_get_tag() is shared for allocating both internal
>>>> tag and drive tag, so driver tag allocation may fail because the hctx is
>>>> marked as inactive.
>>>>
>>>> Fix the issue by moving BLK_MQ_S_INACTIVE check to __blk_mq_alloc_request().
>>>
>>> This looks correct, but a little ugly.  How about we resurrect my
>>> patch to split off blk_mq_get_driver_tag entirely?  Quick untested rebase
>>
>> OK, I am fine.
>>
>>> below, still needs a better changelog and fixes tg:
>>>
>>> ---
>>>  From e432011e2eb5ac7bd1046bbf936645e9f7b74e64 Mon Sep 17 00:00:00 2001
>>> From: Christoph Hellwig <hch@lst.de>
>>> Date: Sat, 16 May 2020 08:03:48 +0200
>>> Subject: blk-mq: split out a __blk_mq_get_driver_tag helper
>>>
>>> Allocation of the driver tag in the case of using a scheduler shares very
>>> little code with the "normal" tag allocation.  Split out a new helper to
>>> streamline this path, and untangle it from the complex normal tag
>>> allocation.
>>>
>>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>>> ---
>>>   block/blk-mq-tag.c | 27 +++++++++++++++++++++++++++
>>>   block/blk-mq-tag.h |  8 ++++++++
>>>   block/blk-mq.c     | 29 -----------------------------
>>>   block/blk-mq.h     |  1 -
>>>   4 files changed, 35 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>>> index 96a39d0724a29..cded7fdcad8ef 100644
>>> --- a/block/blk-mq-tag.c
>>> +++ b/block/blk-mq-tag.c
>>> @@ -191,6 +191,33 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>>>       return tag + tag_offset;
>>>   }
>>>   +bool __blk_mq_get_driver_tag(struct request *rq)
>>> +{
>>> +    struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
>>> +    unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
>>> +    bool shared = blk_mq_tag_busy(rq->mq_hctx);
>>
>> Not necessary to add 'shared' which is just used once.
>>
>>> +    int tag;
>>> +
>>> +    if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
>>> +        bt = &rq->mq_hctx->tags->breserved_tags;
>>
>> Too many rq->mq_hctx->tags, you can add one local variable to store it.
>>
>> Otherwise, looks fine:
>>
>> Reviewed-by: Ming Lei <ming.lei@redhat.com>
> 
> So I did see this for mq-deadline scheduler:
> 
> ] Unable to handle kernel NULL pointer dereference at virtual address
> 0000000000000040
> [  518.688402] Mem abort info:
> [  518.691183]ESR = 0x96000004
> [  518.694233]EC = 0x25: DABT (current EL), IL = 32 bits
> [  518.699568]SET = 0, FnV = 0
> [  518.702667] irq_shutdown irq143
> [  518.705937] CPU43: shutdown
> [  518.706093]EA = 0, S1PTW = 0
> [  518.708733] psci: CPU43 killed (polled 0 ms)
> [  518.716141] Data abort info:
> [  518.719050]ISV = 0, ISS = 0x00000004
> [  518.722892]CM = 0, WnR = 0
> [  518.725867] user pgtable: 4k pages, 48-bit VAs, pgdp=0000041fa4eae000
> [  518.732318] [0000000000000040] pgd=0000000000000000
> [  518.737208] Internal error: Oops: 96000004 [#1] PREEMPT SMP
> [  518.742768] Modules linked in:
> [  518.745812] CPU: 0 PID: 1309 Comm: kworker/43:1H Not tainted
> 5.7.0-rc2-00125-gf498b5de8a1c #390
> [  518.754496] Hardware name: Huawei Taishan 2280 /D05, BIOS Hisilicon D05 IT21
> Nemo 2.0 RC0 04/18/2018
> [  518.763624] Workqueue: kblockd blk_mq_run_work_fn
> [  518.768315] pstate: 80000005 (Nzcv daif -PAN -UAO)
> [  518.773094] pc : blk_mq_put_tag+0x2c/0x68
> [  518.777091] lr : blk_mq_get_tag+0x2ec/0x338
> [  518.781260] sp : ffff800021a53ab0
> [  518.784562] x29: ffff800021a53ab0 x28: ffff041f9e81d248
> [  518.789861] x27: 00000000000002aa x26: ffff041fa3e49300
> [  518.795159] x25: ffff001fb0178fc0 x24: ffff800021a53c68
> [  518.800458] x23: 0000000000000000 x22: 0000000000000000
> [  518.805756] x21: ffff800011a69000 x20: ffff041fa3e49310
> [  518.811055] x19: ffff800021a53b90 x18: 0000000000000004
> [  518.816354] x17: 0000000000000000 x16: 0000000000000000
> [  518.821652] x15: 0000000000000000 x14: 00000000000000ef
> [  518.826951] x13: 00000000000000ef x12: 0000000000000001
> [  518.832249] x11: 0000000000000000 x10: 00000000000009c0
> [  518.837548] x9 : ffff800021a53d50 x8 : ffff041f9e39a620
> [  518.842847] x7 : fefefefefefefeff x6 : 0000000000000000
> [  518.848146] x5 : 400007ffffffffff x4 : 0000000000001000
> [  518.853445] x3 : 00000000000002aa x2 : 0000000000000000
> [  518.858743] x1 : 0000000000000000 x0 : ffff041fa3e49300
> [  518.864042] Call trace:
> [  518.866477]  blk_mq_put_tag+0x2c/0x68
> [  518.870126]  blk_mq_get_tag+0x2ec/0x338
> [  518.873948]  blk_mq_get_driver_tag+0xf8/0x168
> [  518.878292]  blk_mq_dispatch_rq_list+0x118/0x670
> [  518.882896]  blk_mq_do_dispatch_sched+0xb0/0x148
> [  518.887500]  __blk_mq_sched_dispatch_requests+0xec/0x1d8
> [  518.892799]  blk_mq_sched_dispatch_requests+0x3c/0x78
> [  518.897837]  __blk_mq_run_hw_queue+0xb4/0x130
> [  518.902181]  blk_mq_run_work_fn+0x20/0x30
> [  518.906179]  process_one_work+0x1e8/0x360
> [  518.910176]  worker_thread+0x44/0x488
> [  518.913825]  kthread+0x150/0x158
> [  518.917041]  ret_from_fork+0x10/0x1c
> [  518.920606] Code: b9400004 4b020063 6b04007f 540001e2 (b9404022)
> 
> With Christoph's patch and "blk-mq: get ctx in order to handle BLK_MQ_S_INACTIVE
> in blk_mq_get_tag()" [I didn't test without that patch], no issue detected so far.
> 

I think my patch is no longer required with Christoph's patch.

Dongli Zhang

Dongli Zhang
John Garry June 4, 2020, 10:24 a.m. UTC | #6
>>>>
>>>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>>>> index 96a39d0724a29..cded7fdcad8ef 100644
>>>> --- a/block/blk-mq-tag.c
>>>> +++ b/block/blk-mq-tag.c
>>>> @@ -191,6 +191,33 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>>>>        return tag + tag_offset;
>>>>    }
>>>>    +bool __blk_mq_get_driver_tag(struct request *rq)
>>>> +{
>>>> +    struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
>>>> +    unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
>>>> +    bool shared = blk_mq_tag_busy(rq->mq_hctx);
>>>
>>> Not necessary to add 'shared' which is just used once.
>>>
>>>> +    int tag;
>>>> +
>>>> +    if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
>>>> +        bt = &rq->mq_hctx->tags->breserved_tags;
>>>
>>> Too many rq->mq_hctx->tags, you can add one local variable to store it.
>>>
>>> Otherwise, looks fine:
>>>
>>> Reviewed-by: Ming Lei <ming.lei@redhat.com>

Unfortunately this looks to still have problems. So, for a start, this 
stackframe is pumped out a lot (which I removed, for below):

[  697.464058] Workqueue: kblockd blk_mq_run_work_fn
[  697.468749] Call trace:
[  697.471184]  dump_backtrace+0x0/0x1c0
[  697.474833]  show_stack+0x18/0x28
[  697.478136]  dump_stack+0xc8/0x114
[  697.481524]  __blk_mq_run_hw_queue+0x124/0x130
[  697.485954]  blk_mq_run_work_fn+0x20/0x30
[  697.489951]  process_one_work+0x1e8/0x360
[  697.493947]  worker_thread+0x238/0x478
[  697.497683]  kthread+0x150/0x158
[  697.500898]  ret_from_fork+0x10/0x1c
[  697.504480] run queue from wrong CPU 32, hctx active
[  697.509444] CPU: 32 PID: 1477 Comm: kworker/21:2H Not tainted 5.7.0-

And then other time I get SCSI timeouts:

[95.170616] psci: CPU21 killed (polled 0 ms)
[95.419832] CPU22: shutdown
[95.422630] psci: CPU22 killed (polled 0 ms)
[95.683772] irq_shutdown irq1384MiB/s,w=0KiB/s][r=380k,w=0 IOPS][eta 
00m:50s]
[95.687013] CPU23: shutdown
[95.689829] psci: CPU23 killed (polled 0 ms)
[96.379817] CPU24: shutdown
[96.382617] psci: CPU24 killed (polled 0 ms)
[96.703823] CPU25: shutdown=1902MiB/s,w=0KiB/s][r=487k,w=0 IOPS][eta 
00m:49s]
[96.706667] psci: CPU25 killed (polled 0 ms)
[97.299845] CPU26: shutdown
[97.302647] psci: CPU26 killed (polled 0 ms)
[97.331737] irq_shutdown irq139
[97.334969] CPU27: shutdown
[97.337774] psci: CPU27 killed (polled 0 ms)
[  102.308329] CPU28: shutdown=1954MiB/s,w=0KiB/s][r=500k,w=0 IOPS][eta 
00m:44s]
[  102.311137] psci: CPU28 killed (polled 0 ms)
[  102.663877] CPU29: shutdown=1755MiB/s,w=0KiB/s][r=449k,w=0 IOPS][eta 
00m:43s]
[  102.666679] psci: CPU29 killed (polled 0 ms)
[  104.459867] CPU30: shutdown=1552MiB/s,w=0KiB/s][r=397k,w=0 IOPS][eta 
00m:42s]
[  104.462668] psci: CPU30 killed (polled 0 ms)
[  106.259909] IRQ 43: no longer affine to CPU31s][r=414k,w=0 IOPS][eta 
00m:40s]
[  106.264273] IRQ 114: no longer affine to CPU31
[  106.268707] IRQ 118: no longer affine to CPU31
[  106.273141] IRQ 123: no longer affine to CPU31
[  106.277579] irq_shutdown irq140
[  106.280757] IRQ 332: no longer affine to CPU31
[  106.285190] IRQ 336: no longer affine to CPU31
[  106.289623] IRQ 338: no longer affine to CPU31
[  106.294056] IRQ 340: no longer affine to CPU31
[  106.298489] IRQ 344: no longer affine to CPU31
[  106.302922] IRQ 348: no longer affine to CPU31
[  106.307450] CPU31: shutdown
[  106.310251] psci: CPU31 killed (polled 0 ms)
[  108.367884] CPU32: shutdown=1693MiB/s,w=0KiB/s][r=434k,w=0 IOPS][eta 
00m:38s]
[  108.370683] psci: CPU32 killed (polled 0 ms)
[  110.808471] CPU33: shutdown=1747MiB/s,w=0KiB/s][r=447k,w=0 IOPS][eta 
00m:35s]
[  110.811269] psci: CPU33 killed (polled 0 ms)
[  110.871804] CPU34: shutdown
[  110.874598] psci: CPU34 killed (polled 0 ms)
[  111.767879] irq_shutdown irq1412MiB/s,w=0KiB/s][r=418k,w=0 IOPS][eta 
00m:34s]
[  111.771171] CPU35: shutdown
[  111.773975] psci: CPU35 killed (polled 0 ms)
[  113.167866] CPU36: shutdown=1680MiB/s,w=0KiB/s][r=430k,w=0 IOPS][eta 
00m:33s]
[  113.170673] psci: CPU36 killed (polled 0 ms)
[  115.783879] CPU37: shutdown=1839MiB/s,w=0KiB/s][r=471k,w=0 IOPS][eta 
00m:30s]
[  115.786675] psci: CPU37 killed (polled 0 ms)
[  117.111877] CPU38: shutdown=2032MiB/s,w=0KiB/s][r=520k,w=0 IOPS][eta 
00m:29s]
[  117.114670] psci: CPU38 killed (polled 0 ms)
[  121.263701] irq_shutdown irq1423MiB/s,w=0KiB/s][r=428k,w=0 IOPS][eta 
00m:25s]
[  121.266996] CPU39: shutdown
[  121.269797] psci: CPU39 killed (polled 0 ms)
[  121.364122] CPU40: shutdown
[  121.366918] psci: CPU40 killed (polled 0 ms)
[  121.448256] CPU41: shutdown
[  121.451058] psci: CPU41 killed (polled 0 ms)
[  121.515785] CPU42: shutdown=1711MiB/s,w=0KiB/s][r=438k,w=0 IOPS][eta 
00m:24s]
[  121.518586] psci: CPU42 killed (polled 0 ms)
[  121.563659] irq_shutdown irq143
[  121.566922] CPU43: shutdown
[  121.569732] psci: CPU43 killed (polled 0 ms)
[  122.619423] scsi_times_out req=0xffff001faf906900=371k,w=0 IOPS][eta 
00m:23s]
[  122.624131] scsi_times_out req=0xffff001faf90ed00
[  122.628842] scsi_times_out req=0xffff001faf90fc00
[  122.633547] scsi_times_out req=0xffff001faf906000
[  122.638259] scsi_times_out req=0xffff001faf906300
[  122.642969] scsi_times_out req=0xffff001faf906600
[  122.647676] scsi_times_out req=0xffff001faf906c00
[  122.652381] scsi_times_out req=0xffff001faf906f00
[  122.657086] scsi_times_out req=0xffff001faf907200
[  122.661791] scsi_times_out req=0xffff001faf907500
[  122.666498] scsi_times_out req=0xffff001faf907800
[  122.671203] scsi_times_out req=0xffff001faf907b00
[  122.675909] scsi_times_out req=0xffff001faf907e00
[  122.680615] scsi_times_out req=0xffff001faf908100
[  122.685319] scsi_times_out req=0xffff001faf908400
[  122.690025] scsi_times_out req=0xffff001faf908700
[  122.694731] scsi_times_out req=0xffff001faf908a00
[  122.699439] scsi_times_out req=0xffff001faf908d00
[  122.704145] scsi_times_out req=0xffff001faf909000
[  122.708849] scsi_times_out req=0xffff001faf909300


And another:

51.336698] CPU53: shutdown
[  251.336737] run queue from wrong CPU 0, hctx active
[  251.339545] run queue from wrong CPU 0, hctx active
[  251.344695] psci: CPU53 killed (polled 0 ms)
[  251.349302] run queue from wrong CPU 0, hctx active
[  251.358447] run queue from wrong CPU 0, hctx active
[  251.363336] run queue from wrong CPU 0, hctx active
[  251.368222] run queue from wrong CPU 0, hctx active
[  251.373108] run queue from wrong CPU 0, hctx active
[  252.488831] CPU54: shutdown=1094MiB/s,w=0KiB/s][r=280k,w=0 IOPS][eta 
00m:04s]
[  252.491629] psci: CPU54 killed (polled 0 ms)
[  252.536533] irq_shutdown irq146
[  252.540007] CPU55: shutdown
[  252.543273] psci: CPU55 killed (polled 0 ms)
[  252.656656] CPU56: shutdown
[  252.659446] psci: CPU56 killed (polled 0 ms)
[  252.704576] CPU57: shutdown=947MiB/s,w=0KiB/s][r=242k,w=0 IOPS][eta 
00m:02s]
[  252.707369] psci: CPU57 killed (polled 0 ms)
[  254.352646] CPU58: shutdown=677MiB/s,w=0KiB/s][r=173k,w=0 IOPS][eta 
00m:02s]
[  254.355442] psci: CPU58 killed (polled 0 ms)
long sleep 1
[  279.288227] scsi_times_out req=0xffff041fa10b6600[r=0,w=0 IOPS][eta 
04m:37s]
[  279.320281] sas: Enter sas_scsi_recover_host busy: 1 failed: 1
[  279.326144] sas: trying to find task 0x00000000e8dca422
[  279.331379] sas: sas_scsi_find_task: aborting task 0x00000000e8dca422

none scheduler seems ok.

>> So I did see this for mq-deadline scheduler:
>>
>> ] Unable to handle kernel NULL pointer dereference at virtual address
>> 0000000000000040
>> [  518.688402] Mem abort info:
>> [  518.691183]ESR = 0x96000004
>> [  518.694233]EC = 0x25: DABT (current EL), IL = 32 bits
>> [  518.699568]SET = 0, FnV = 0
>> [  518.702667] irq_shutdown irq143
>> [  518.705937] CPU43: shutdown
>> [  518.706093]EA = 0, S1PTW = 0
>> [  518.708733] psci: CPU43 killed (polled 0 ms)
>> [  518.716141] Data abort info:
>> [  518.719050]ISV = 0, ISS = 0x00000004
>> [  518.722892]CM = 0, WnR = 0
>> [  518.725867] user pgtable: 4k pages, 48-bit VAs, pgdp=0000041fa4eae000
>> [  518.732318] [0000000000000040] pgd=0000000000000000
>> [  518.737208] Internal error: Oops: 96000004 [#1] PREEMPT SMP
>> [  518.742768] Modules linked in:
>> [  518.745812] CPU: 0 PID: 1309 Comm: kworker/43:1H Not tainted
>> 5.7.0-rc2-00125-gf498b5de8a1c #390
>> [  518.754496] Hardware name: Huawei Taishan 2280 /D05, BIOS Hisilicon D05 IT21
>> Nemo 2.0 RC0 04/18/2018
>> [  518.763624] Workqueue: kblockd blk_mq_run_work_fn
>> [  518.768315] pstate: 80000005 (Nzcv daif -PAN -UAO)
>> [  518.773094] pc : blk_mq_put_tag+0x2c/0x68
>> [  518.777091] lr : blk_mq_get_tag+0x2ec/0x338
>> [  518.781260] sp : ffff800021a53ab0
>> [  518.784562] x29: ffff800021a53ab0 x28: ffff041f9e81d248
>> [  518.789861] x27: 00000000000002aa x26: ffff041fa3e49300
>> [  518.795159] x25: ffff001fb0178fc0 x24: ffff800021a53c68
>> [  518.800458] x23: 0000000000000000 x22: 0000000000000000
>> [  518.805756] x21: ffff800011a69000 x20: ffff041fa3e49310
>> [  518.811055] x19: ffff800021a53b90 x18: 0000000000000004
>> [  518.816354] x17: 0000000000000000 x16: 0000000000000000
>> [  518.821652] x15: 0000000000000000 x14: 00000000000000ef
>> [  518.826951] x13: 00000000000000ef x12: 0000000000000001
>> [  518.832249] x11: 0000000000000000 x10: 00000000000009c0
>> [  518.837548] x9 : ffff800021a53d50 x8 : ffff041f9e39a620
>> [  518.842847] x7 : fefefefefefefeff x6 : 0000000000000000
>> [  518.848146] x5 : 400007ffffffffff x4 : 0000000000001000
>> [  518.853445] x3 : 00000000000002aa x2 : 0000000000000000
>> [  518.858743] x1 : 0000000000000000 x0 : ffff041fa3e49300
>> [  518.864042] Call trace:
>> [  518.866477]  blk_mq_put_tag+0x2c/0x68
>> [  518.870126]  blk_mq_get_tag+0x2ec/0x338
>> [  518.873948]  blk_mq_get_driver_tag+0xf8/0x168
>> [  518.878292]  blk_mq_dispatch_rq_list+0x118/0x670
>> [  518.882896]  blk_mq_do_dispatch_sched+0xb0/0x148
>> [  518.887500]  __blk_mq_sched_dispatch_requests+0xec/0x1d8
>> [  518.892799]  blk_mq_sched_dispatch_requests+0x3c/0x78
>> [  518.897837]  __blk_mq_run_hw_queue+0xb4/0x130
>> [  518.902181]  blk_mq_run_work_fn+0x20/0x30
>> [  518.906179]  process_one_work+0x1e8/0x360
>> [  518.910176]  worker_thread+0x44/0x488
>> [  518.913825]  kthread+0x150/0x158
>> [  518.917041]  ret_from_fork+0x10/0x1c
>> [  518.920606] Code: b9400004 4b020063 6b04007f 540001e2 (b9404022)
>>
>> With Christoph's patch and "blk-mq: get ctx in order to handle BLK_MQ_S_INACTIVE
>> in blk_mq_get_tag()" [I didn't test without that patch], no issue detected so far.
>>
> 
> I think my patch is no longer required with Christoph's patch.
> 

ah, of course.

> Dongli Zhang

Thanks,
John
Ming Lei June 4, 2020, 11:26 a.m. UTC | #7
On Thu, Jun 04, 2020 at 11:24:29AM +0100, John Garry wrote:
> > > > > 
> > > > > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > > > > index 96a39d0724a29..cded7fdcad8ef 100644
> > > > > --- a/block/blk-mq-tag.c
> > > > > +++ b/block/blk-mq-tag.c
> > > > > @@ -191,6 +191,33 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
> > > > >        return tag + tag_offset;
> > > > >    }
> > > > >    +bool __blk_mq_get_driver_tag(struct request *rq)
> > > > > +{
> > > > > +    struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
> > > > > +    unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
> > > > > +    bool shared = blk_mq_tag_busy(rq->mq_hctx);
> > > > 
> > > > Not necessary to add 'shared' which is just used once.
> > > > 
> > > > > +    int tag;
> > > > > +
> > > > > +    if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
> > > > > +        bt = &rq->mq_hctx->tags->breserved_tags;
> > > > 
> > > > Too many rq->mq_hctx->tags, you can add one local variable to store it.
> > > > 
> > > > Otherwise, looks fine:
> > > > 
> > > > Reviewed-by: Ming Lei <ming.lei@redhat.com>
> 
> Unfortunately this looks to still have problems. So, for a start, this
> stackframe is pumped out a lot (which I removed, for below):
> 
> [  697.464058] Workqueue: kblockd blk_mq_run_work_fn
> [  697.468749] Call trace:
> [  697.471184]  dump_backtrace+0x0/0x1c0
> [  697.474833]  show_stack+0x18/0x28
> [  697.478136]  dump_stack+0xc8/0x114
> [  697.481524]  __blk_mq_run_hw_queue+0x124/0x130
> [  697.485954]  blk_mq_run_work_fn+0x20/0x30
> [  697.489951]  process_one_work+0x1e8/0x360
> [  697.493947]  worker_thread+0x238/0x478
> [  697.497683]  kthread+0x150/0x158
> [  697.500898]  ret_from_fork+0x10/0x1c
> [  697.504480] run queue from wrong CPU 32, hctx active
> [  697.509444] CPU: 32 PID: 1477 Comm: kworker/21:2H Not tainted 5.7.0-
> 
> And then other time I get SCSI timeouts:
> 
> [95.170616] psci: CPU21 killed (polled 0 ms)
> [95.419832] CPU22: shutdown
> [95.422630] psci: CPU22 killed (polled 0 ms)
> [95.683772] irq_shutdown irq1384MiB/s,w=0KiB/s][r=380k,w=0 IOPS][eta
> 00m:50s]
> [95.687013] CPU23: shutdown
> [95.689829] psci: CPU23 killed (polled 0 ms)
> [96.379817] CPU24: shutdown
> [96.382617] psci: CPU24 killed (polled 0 ms)
> [96.703823] CPU25: shutdown=1902MiB/s,w=0KiB/s][r=487k,w=0 IOPS][eta
> 00m:49s]
> [96.706667] psci: CPU25 killed (polled 0 ms)
> [97.299845] CPU26: shutdown
> [97.302647] psci: CPU26 killed (polled 0 ms)
> [97.331737] irq_shutdown irq139
> [97.334969] CPU27: shutdown
> [97.337774] psci: CPU27 killed (polled 0 ms)
> [  102.308329] CPU28: shutdown=1954MiB/s,w=0KiB/s][r=500k,w=0 IOPS][eta
> 00m:44s]
> [  102.311137] psci: CPU28 killed (polled 0 ms)
> [  102.663877] CPU29: shutdown=1755MiB/s,w=0KiB/s][r=449k,w=0 IOPS][eta
> 00m:43s]
> [  102.666679] psci: CPU29 killed (polled 0 ms)
> [  104.459867] CPU30: shutdown=1552MiB/s,w=0KiB/s][r=397k,w=0 IOPS][eta
> 00m:42s]
> [  104.462668] psci: CPU30 killed (polled 0 ms)
> [  106.259909] IRQ 43: no longer affine to CPU31s][r=414k,w=0 IOPS][eta
> 00m:40s]
> [  106.264273] IRQ 114: no longer affine to CPU31
> [  106.268707] IRQ 118: no longer affine to CPU31
> [  106.273141] IRQ 123: no longer affine to CPU31
> [  106.277579] irq_shutdown irq140
> [  106.280757] IRQ 332: no longer affine to CPU31
> [  106.285190] IRQ 336: no longer affine to CPU31
> [  106.289623] IRQ 338: no longer affine to CPU31
> [  106.294056] IRQ 340: no longer affine to CPU31
> [  106.298489] IRQ 344: no longer affine to CPU31
> [  106.302922] IRQ 348: no longer affine to CPU31
> [  106.307450] CPU31: shutdown
> [  106.310251] psci: CPU31 killed (polled 0 ms)
> [  108.367884] CPU32: shutdown=1693MiB/s,w=0KiB/s][r=434k,w=0 IOPS][eta
> 00m:38s]
> [  108.370683] psci: CPU32 killed (polled 0 ms)
> [  110.808471] CPU33: shutdown=1747MiB/s,w=0KiB/s][r=447k,w=0 IOPS][eta
> 00m:35s]
> [  110.811269] psci: CPU33 killed (polled 0 ms)
> [  110.871804] CPU34: shutdown
> [  110.874598] psci: CPU34 killed (polled 0 ms)
> [  111.767879] irq_shutdown irq1412MiB/s,w=0KiB/s][r=418k,w=0 IOPS][eta
> 00m:34s]
> [  111.771171] CPU35: shutdown
> [  111.773975] psci: CPU35 killed (polled 0 ms)
> [  113.167866] CPU36: shutdown=1680MiB/s,w=0KiB/s][r=430k,w=0 IOPS][eta
> 00m:33s]
> [  113.170673] psci: CPU36 killed (polled 0 ms)
> [  115.783879] CPU37: shutdown=1839MiB/s,w=0KiB/s][r=471k,w=0 IOPS][eta
> 00m:30s]
> [  115.786675] psci: CPU37 killed (polled 0 ms)
> [  117.111877] CPU38: shutdown=2032MiB/s,w=0KiB/s][r=520k,w=0 IOPS][eta
> 00m:29s]
> [  117.114670] psci: CPU38 killed (polled 0 ms)
> [  121.263701] irq_shutdown irq1423MiB/s,w=0KiB/s][r=428k,w=0 IOPS][eta
> 00m:25s]
> [  121.266996] CPU39: shutdown
> [  121.269797] psci: CPU39 killed (polled 0 ms)
> [  121.364122] CPU40: shutdown
> [  121.366918] psci: CPU40 killed (polled 0 ms)
> [  121.448256] CPU41: shutdown
> [  121.451058] psci: CPU41 killed (polled 0 ms)
> [  121.515785] CPU42: shutdown=1711MiB/s,w=0KiB/s][r=438k,w=0 IOPS][eta
> 00m:24s]
> [  121.518586] psci: CPU42 killed (polled 0 ms)
> [  121.563659] irq_shutdown irq143
> [  121.566922] CPU43: shutdown
> [  121.569732] psci: CPU43 killed (polled 0 ms)
> [  122.619423] scsi_times_out req=0xffff001faf906900=371k,w=0 IOPS][eta
> 00m:23s]
> [  122.624131] scsi_times_out req=0xffff001faf90ed00
> [  122.628842] scsi_times_out req=0xffff001faf90fc00
> [  122.633547] scsi_times_out req=0xffff001faf906000
> [  122.638259] scsi_times_out req=0xffff001faf906300
> [  122.642969] scsi_times_out req=0xffff001faf906600
> [  122.647676] scsi_times_out req=0xffff001faf906c00
> [  122.652381] scsi_times_out req=0xffff001faf906f00
> [  122.657086] scsi_times_out req=0xffff001faf907200
> [  122.661791] scsi_times_out req=0xffff001faf907500
> [  122.666498] scsi_times_out req=0xffff001faf907800
> [  122.671203] scsi_times_out req=0xffff001faf907b00
> [  122.675909] scsi_times_out req=0xffff001faf907e00
> [  122.680615] scsi_times_out req=0xffff001faf908100
> [  122.685319] scsi_times_out req=0xffff001faf908400
> [  122.690025] scsi_times_out req=0xffff001faf908700
> [  122.694731] scsi_times_out req=0xffff001faf908a00
> [  122.699439] scsi_times_out req=0xffff001faf908d00
> [  122.704145] scsi_times_out req=0xffff001faf909000
> [  122.708849] scsi_times_out req=0xffff001faf909300
> 
> 
> And another:
> 
> 51.336698] CPU53: shutdown
> [  251.336737] run queue from wrong CPU 0, hctx active
> [  251.339545] run queue from wrong CPU 0, hctx active
> [  251.344695] psci: CPU53 killed (polled 0 ms)
> [  251.349302] run queue from wrong CPU 0, hctx active
> [  251.358447] run queue from wrong CPU 0, hctx active
> [  251.363336] run queue from wrong CPU 0, hctx active
> [  251.368222] run queue from wrong CPU 0, hctx active
> [  251.373108] run queue from wrong CPU 0, hctx active
> [  252.488831] CPU54: shutdown=1094MiB/s,w=0KiB/s][r=280k,w=0 IOPS][eta
> 00m:04s]
> [  252.491629] psci: CPU54 killed (polled 0 ms)
> [  252.536533] irq_shutdown irq146
> [  252.540007] CPU55: shutdown
> [  252.543273] psci: CPU55 killed (polled 0 ms)
> [  252.656656] CPU56: shutdown
> [  252.659446] psci: CPU56 killed (polled 0 ms)
> [  252.704576] CPU57: shutdown=947MiB/s,w=0KiB/s][r=242k,w=0 IOPS][eta
> 00m:02s]
> [  252.707369] psci: CPU57 killed (polled 0 ms)
> [  254.352646] CPU58: shutdown=677MiB/s,w=0KiB/s][r=173k,w=0 IOPS][eta
> 00m:02s]
> [  254.355442] psci: CPU58 killed (polled 0 ms)
> long sleep 1
> [  279.288227] scsi_times_out req=0xffff041fa10b6600[r=0,w=0 IOPS][eta
> 04m:37s]
> [  279.320281] sas: Enter sas_scsi_recover_host busy: 1 failed: 1
> [  279.326144] sas: trying to find task 0x00000000e8dca422
> [  279.331379] sas: sas_scsi_find_task: aborting task 0x00000000e8dca422
> 
> none scheduler seems ok.

Can you reproduce it by just applying the patch of 'blk-mq: don't fail driver tag
allocation because of inactive hctx'?

If yes, can you collect debugfs log after the timeout is triggered?


Thanks,
Ming
John Garry June 4, 2020, 11:50 a.m. UTC | #8
On 04/06/2020 12:26, Ming Lei wrote:
>> And another:
>>
>> 51.336698] CPU53: shutdown
>> [  251.336737] run queue from wrong CPU 0, hctx active
>> [  251.339545] run queue from wrong CPU 0, hctx active
>> [  251.344695] psci: CPU53 killed (polled 0 ms)
>> [  251.349302] run queue from wrong CPU 0, hctx active
>> [  251.358447] run queue from wrong CPU 0, hctx active
>> [  251.363336] run queue from wrong CPU 0, hctx active
>> [  251.368222] run queue from wrong CPU 0, hctx active
>> [  251.373108] run queue from wrong CPU 0, hctx active
>> [  252.488831] CPU54: shutdown=1094MiB/s,w=0KiB/s][r=280k,w=0 IOPS][eta
>> 00m:04s]
>> [  252.491629] psci: CPU54 killed (polled 0 ms)
>> [  252.536533] irq_shutdown irq146
>> [  252.540007] CPU55: shutdown
>> [  252.543273] psci: CPU55 killed (polled 0 ms)
>> [  252.656656] CPU56: shutdown
>> [  252.659446] psci: CPU56 killed (polled 0 ms)
>> [  252.704576] CPU57: shutdown=947MiB/s,w=0KiB/s][r=242k,w=0 IOPS][eta
>> 00m:02s]
>> [  252.707369] psci: CPU57 killed (polled 0 ms)
>> [  254.352646] CPU58: shutdown=677MiB/s,w=0KiB/s][r=173k,w=0 IOPS][eta
>> 00m:02s]
>> [  254.355442] psci: CPU58 killed (polled 0 ms)
>> long sleep 1
>> [  279.288227] scsi_times_out req=0xffff041fa10b6600[r=0,w=0 IOPS][eta
>> 04m:37s]
>> [  279.320281] sas: Enter sas_scsi_recover_host busy: 1 failed: 1
>> [  279.326144] sas: trying to find task 0x00000000e8dca422
>> [  279.331379] sas: sas_scsi_find_task: aborting task 0x00000000e8dca422
>>
>> none scheduler seems ok.
> Can you reproduce it by just applying the patch of 'blk-mq: don't fail driver tag
> allocation because of inactive hctx'?

That's your patch - ok, I can try.

So I added some debug for when using Christoph's:

static bool blk_mq_hctx_has_requests2(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->tags;
	struct rq_iter_data data = {
		.hctx	= hctx,
	};

	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
	return data.has_rq;
}


static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
		struct blk_mq_hw_ctx *hctx)
{
@@ -2386,6 +2398,8 @@ static int blk_mq_hctx_notify_offline(unsigned int 
cpu, struct hlist_node *node)
	if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
		while (blk_mq_hctx_has_requests(hctx))
			msleep(5);
+		if (blk_mq_hctx_has_requests2(hctx))
+			pr_err("%s still has driver tag - but how? hctx->sched_tags=%pS\n", 
__func__, hctx->sched_tags);
		percpu_ref_put(&hctx->queue->q_usage_counter);
	} else {

And I get this:

[  366.839260] CPU46: shutdown
[  366.842052] psci: CPU46 killed (polled 0 ms)
[  366.910923] blk_mq_hctx_notify_offline still has driver tag - but 
how? hctx->sched_tags=0xffff041fa59e0d00
[  366.920870] irq_migrate_all_off_this_cpu: 5 callbacks suppressed
[  366.920871] IRQ 113: no longer affine to CPU47
[  366.931299] IRQ 116: no longer affine to CPU47
[  366.935734] IRQ 119: no longer affine to CPU47
[  366.940169] IRQ 123: no longer affine to CPU47
[  366.944607] irq_shutdown irq144
[  366.947783] IRQ 330: no longer affine to CPU47
[  366.952217] IRQ 333: no longer affine to CPU47
[  366.956651] IRQ 336: no longer affine to CPU47
[  366.961085] IRQ 339: no longer affine to CPU47
[  366.965519] IRQ 342: no longer affine to CPU47
[  366.969953] IRQ 345: no longer affine to CPU47
[  366.974475] CPU47: shutdown
[  366.977273] psci: CPU47 killed (polled 0 ms)
[  367.063066] CPU48: shutdown
[  367.065857] psci: CPU48 killed (polled 0 ms)
[  367.107071] CPU49: shutdown
[  367.109858] psci: CPU49 killed (polled 0 ms)
[  367.147057] CPU50: shutdown
[  367.149843] psci: CPU50 killed (polled 0 ms)
[  367.175006] irq_shutdown irq145
[  367.178272] CPU51: shutdown
[  367.181070] psci: CPU51 killed (polled 0 ms)
[  367.223067] CPU52: shutdown
[  367.225856] psci: CPU52 killed (polled 0 ms)
[  367.267024] CPU53: shutdown
[  367.269821] psci: CPU53 killed (polled 0 ms)
[  367.303004] CPU54: shutdown
[  367.305792] psci: CPU54 killed (polled 0 ms)
[  367.342948] irq_shutdown irq146
[  367.346169] CPU55: shutdown
[  367.348965] psci: CPU55 killed (polled 0 ms)
[  367.387004] CPU56: shutdown][16.9%][r=869MiB/s,w=0KiB/s][r=223k,w=0 
IOPS][eta 04m:59s]
[  367.389791] psci: CPU56 killed (polled 0 ms)
[  367.419017] CPU57: shutdown
[  367.421804] psci: CPU57 killed (polled 0 ms)
[  367.447010] CPU58: shutdown
[  367.449796] psci: CPU58 killed (polled 0 ms)
long sleep 2
[  398.070707] scsi_times_out req=0xffff041fa1a89c00iB/s][r=0,w=0 
IOPS][eta 04m:29s]

I thought that if all the sched tags are put, then we should have no 
driver tag for that same hctx, right? That seems to coincide with the 
timeout (30 seconds later)

> 
> If yes, can you collect debugfs log after the timeout is triggered?

Same limitation as before - once SCSI timeout happens, SCSI error 
handling kicks in and the shost no longer accepts commands, and, since 
that same shost provides rootfs, becomes unresponsive. But I can try.

Cheers,
John
Ming Lei June 4, 2020, 12:08 p.m. UTC | #9
On Thu, Jun 04, 2020 at 12:50:32PM +0100, John Garry wrote:
> On 04/06/2020 12:26, Ming Lei wrote:
> > > And another:
> > > 
> > > 51.336698] CPU53: shutdown
> > > [  251.336737] run queue from wrong CPU 0, hctx active
> > > [  251.339545] run queue from wrong CPU 0, hctx active
> > > [  251.344695] psci: CPU53 killed (polled 0 ms)
> > > [  251.349302] run queue from wrong CPU 0, hctx active
> > > [  251.358447] run queue from wrong CPU 0, hctx active
> > > [  251.363336] run queue from wrong CPU 0, hctx active
> > > [  251.368222] run queue from wrong CPU 0, hctx active
> > > [  251.373108] run queue from wrong CPU 0, hctx active
> > > [  252.488831] CPU54: shutdown=1094MiB/s,w=0KiB/s][r=280k,w=0 IOPS][eta
> > > 00m:04s]
> > > [  252.491629] psci: CPU54 killed (polled 0 ms)
> > > [  252.536533] irq_shutdown irq146
> > > [  252.540007] CPU55: shutdown
> > > [  252.543273] psci: CPU55 killed (polled 0 ms)
> > > [  252.656656] CPU56: shutdown
> > > [  252.659446] psci: CPU56 killed (polled 0 ms)
> > > [  252.704576] CPU57: shutdown=947MiB/s,w=0KiB/s][r=242k,w=0 IOPS][eta
> > > 00m:02s]
> > > [  252.707369] psci: CPU57 killed (polled 0 ms)
> > > [  254.352646] CPU58: shutdown=677MiB/s,w=0KiB/s][r=173k,w=0 IOPS][eta
> > > 00m:02s]
> > > [  254.355442] psci: CPU58 killed (polled 0 ms)
> > > long sleep 1
> > > [  279.288227] scsi_times_out req=0xffff041fa10b6600[r=0,w=0 IOPS][eta
> > > 04m:37s]
> > > [  279.320281] sas: Enter sas_scsi_recover_host busy: 1 failed: 1
> > > [  279.326144] sas: trying to find task 0x00000000e8dca422
> > > [  279.331379] sas: sas_scsi_find_task: aborting task 0x00000000e8dca422
> > > 
> > > none scheduler seems ok.
> > Can you reproduce it by just applying the patch of 'blk-mq: don't fail driver tag
> > allocation because of inactive hctx'?
> 
> That's your patch - ok, I can try.
> 
> So I added some debug for when using Christoph's:
> 
> static bool blk_mq_hctx_has_requests2(struct blk_mq_hw_ctx *hctx)
> {
> 	struct blk_mq_tags *tags = hctx->tags;
> 	struct rq_iter_data data = {
> 		.hctx	= hctx,
> 	};
> 
> 	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
> 	return data.has_rq;
> }
> 
> 
> static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
> 		struct blk_mq_hw_ctx *hctx)
> {
> @@ -2386,6 +2398,8 @@ static int blk_mq_hctx_notify_offline(unsigned int
> cpu, struct hlist_node *node)
> 	if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
> 		while (blk_mq_hctx_has_requests(hctx))
> 			msleep(5);
> +		if (blk_mq_hctx_has_requests2(hctx))
> +			pr_err("%s still has driver tag - but how? hctx->sched_tags=%pS\n",
> __func__, hctx->sched_tags);
> 		percpu_ref_put(&hctx->queue->q_usage_counter);
> 	} else {
> 
> And I get this:
> 
> [  366.839260] CPU46: shutdown
> [  366.842052] psci: CPU46 killed (polled 0 ms)
> [  366.910923] blk_mq_hctx_notify_offline still has driver tag - but how?
> hctx->sched_tags=0xffff041fa59e0d00
> [  366.920870] irq_migrate_all_off_this_cpu: 5 callbacks suppressed
> [  366.920871] IRQ 113: no longer affine to CPU47
> [  366.931299] IRQ 116: no longer affine to CPU47
> [  366.935734] IRQ 119: no longer affine to CPU47
> [  366.940169] IRQ 123: no longer affine to CPU47
> [  366.944607] irq_shutdown irq144
> [  366.947783] IRQ 330: no longer affine to CPU47
> [  366.952217] IRQ 333: no longer affine to CPU47
> [  366.956651] IRQ 336: no longer affine to CPU47
> [  366.961085] IRQ 339: no longer affine to CPU47
> [  366.965519] IRQ 342: no longer affine to CPU47
> [  366.969953] IRQ 345: no longer affine to CPU47
> [  366.974475] CPU47: shutdown
> [  366.977273] psci: CPU47 killed (polled 0 ms)
> [  367.063066] CPU48: shutdown
> [  367.065857] psci: CPU48 killed (polled 0 ms)
> [  367.107071] CPU49: shutdown
> [  367.109858] psci: CPU49 killed (polled 0 ms)
> [  367.147057] CPU50: shutdown
> [  367.149843] psci: CPU50 killed (polled 0 ms)
> [  367.175006] irq_shutdown irq145
> [  367.178272] CPU51: shutdown
> [  367.181070] psci: CPU51 killed (polled 0 ms)
> [  367.223067] CPU52: shutdown
> [  367.225856] psci: CPU52 killed (polled 0 ms)
> [  367.267024] CPU53: shutdown
> [  367.269821] psci: CPU53 killed (polled 0 ms)
> [  367.303004] CPU54: shutdown
> [  367.305792] psci: CPU54 killed (polled 0 ms)
> [  367.342948] irq_shutdown irq146
> [  367.346169] CPU55: shutdown
> [  367.348965] psci: CPU55 killed (polled 0 ms)
> [  367.387004] CPU56: shutdown][16.9%][r=869MiB/s,w=0KiB/s][r=223k,w=0
> IOPS][eta 04m:59s]
> [  367.389791] psci: CPU56 killed (polled 0 ms)
> [  367.419017] CPU57: shutdown
> [  367.421804] psci: CPU57 killed (polled 0 ms)
> [  367.447010] CPU58: shutdown
> [  367.449796] psci: CPU58 killed (polled 0 ms)
> long sleep 2
> [  398.070707] scsi_times_out req=0xffff041fa1a89c00iB/s][r=0,w=0 IOPS][eta
> 04m:29s]
> 
> I thought that if all the sched tags are put, then we should have no driver
> tag for that same hctx, right? That seems to coincide with the timeout (30
> seconds later)

That is weird: if a driver tag is found, that means the request is
in-flight and can't be completed by HW. I assume you have integrated
the global host tags patch in your test, and suggest you double-check
hisi_sas's queue mapping, which has to be exactly the same as blk-mq's
mapping.

> 
> > 
> > If yes, can you collect debugfs log after the timeout is triggered?
> 
> Same limitation as before - once SCSI timeout happens, SCSI error handling
> kicks in and the shost no longer accepts commands, and, since that same
> shost provides rootfs, becomes unresponsive. But I can try.

Just wondering why not install two disks in your test machine, :-)


Thanks,
Ming
John Garry June 4, 2020, 12:45 p.m. UTC | #10
>> That's your patch - ok, I can try.
>>

I still get timeouts and sometimes the same driver tag message occurs:

  1014.232417] run queue from wrong CPU 0, hctx active
[ 1014.237692] run queue from wrong CPU 0, hctx active
[ 1014.243014] run queue from wrong CPU 0, hctx active
[ 1014.248370] run queue from wrong CPU 0, hctx active
[ 1014.253725] run queue from wrong CPU 0, hctx active
[ 1014.259252] run queue from wrong CPU 0, hctx active
[ 1014.264492] run queue from wrong CPU 0, hctx active
[ 1014.269453] irq_shutdown irq146
[ 1014.272752] CPU55: shutdown
[ 1014.275552] psci: CPU55 killed (polled 0 ms)
[ 1015.151530] CPU56: shutdownr=1621MiB/s,w=0KiB/s][r=415k,w=0 IOPS][eta 
00m:00s]
[ 1015.154322] psci: CPU56 killed (polled 0 ms)
[ 1015.184345] CPU57: shutdown
[ 1015.187143] psci: CPU57 killed (polled 0 ms)
[ 1015.223388] CPU58: shutdown
[ 1015.226174] psci: CPU58 killed (polled 0 ms)
long sleep 8
[ 1045.234781] scsi_times_out req=0xffff041fa13e6300[r=0,w=0 IOPS][eta 
04m:30s]

[...]

>>
>> I thought that if all the sched tags are put, then we should have no driver
>> tag for that same hctx, right? That seems to coincide with the timeout (30
>> seconds later)
> 
> That is weird, if there is driver tag found, that means the request is
> in-flight and can't be completed by HW.

In blk_mq_hctx_has_requests(), we iterate the sched tags (when 
hctx->sched_tags is set). So can some requests not have a sched tag 
(even with a scheduler set for the queue)?

  I assume you have integrated
> global host tags patch in your test,

No, but the LLDD does not use request->tag - it generates its own.

  and suggest you to double check
> hisi_sas's queue mapping which has to be exactly same with blk-mq's
> mapping.
> 

scheduler=none is ok, so I am skeptical of a problem there.

>>
>>>
>>> If yes, can you collect debugfs log after the timeout is triggered?
>>
>> Same limitation as before - once SCSI timeout happens, SCSI error handling
>> kicks in and the shost no longer accepts commands, and, since that same
>> shost provides rootfs, becomes unresponsive. But I can try.
> 
> Just wondering why not install two disks in your test machine, :-)

The shost becomes unresponsive for all disks. So I could try nfs, but 
I'm not a fan :)

Cheers
Ming Lei June 4, 2020, 1 p.m. UTC | #11
On Thu, Jun 04, 2020 at 01:45:09PM +0100, John Garry wrote:
> 
> > > That's your patch - ok, I can try.
> > > 
> 
> I still get timeouts and sometimes the same driver tag message occurs:
> 
>  1014.232417] run queue from wrong CPU 0, hctx active
> [ 1014.237692] run queue from wrong CPU 0, hctx active
> [ 1014.243014] run queue from wrong CPU 0, hctx active
> [ 1014.248370] run queue from wrong CPU 0, hctx active
> [ 1014.253725] run queue from wrong CPU 0, hctx active
> [ 1014.259252] run queue from wrong CPU 0, hctx active
> [ 1014.264492] run queue from wrong CPU 0, hctx active
> [ 1014.269453] irq_shutdown irq146
> [ 1014.272752] CPU55: shutdown
> [ 1014.275552] psci: CPU55 killed (polled 0 ms)
> [ 1015.151530] CPU56: shutdownr=1621MiB/s,w=0KiB/s][r=415k,w=0 IOPS][eta
> 00m:00s]
> [ 1015.154322] psci: CPU56 killed (polled 0 ms)
> [ 1015.184345] CPU57: shutdown
> [ 1015.187143] psci: CPU57 killed (polled 0 ms)
> [ 1015.223388] CPU58: shutdown
> [ 1015.226174] psci: CPU58 killed (polled 0 ms)
> long sleep 8
> [ 1045.234781] scsi_times_out req=0xffff041fa13e6300[r=0,w=0 IOPS][eta
> 04m:30s]
> 
> [...]
> 
> > > 
> > > I thought that if all the sched tags are put, then we should have no driver
> > > tag for that same hctx, right? That seems to coincide with the timeout (30
> > > seconds later)
> > 
> > That is weird, if there is driver tag found, that means the request is
> > in-flight and can't be completed by HW.
> 
> In blk_mq_hctx_has_requests(), we iterate the sched tags (when
> hctx->sched_tags is set). So can some requests not have a sched tag (even
> for scheduler set for the queue)?

Every request must have a scheduler tag when an I/O scheduler is in use.

> 
>  I assume you have integrated
> > global host tags patch in your test,
> 
> No, but the LLDD does not use request->tag - it generates its own.

It isn't related to request->tag. What I meant is that since you use an
out-of-tree patch to enable multiple hw queues on hisi_sas, you have
to make the queue mapping correct; that is, the exact queue mapping
from blk-mq has to be used, which is built from managed
interrupt affinity.

Please collect the following log:

1) ./dump-io-irq-affinity $PCI_ID_OF_HBA
http://people.redhat.com/minlei/tests/tools/dump-io-irq-affinity

2) ./dump-qmap /dev/sdN
http://people.redhat.com/minlei/tests/tools/dump-qmap


> 
>  and suggest you to double check
> > hisi_sas's queue mapping which has to be exactly same with blk-mq's
> > mapping.
> > 
> 
> scheduler=none is ok, so I am skeptical of a problem there.
> 
> > > 
> > > > 
> > > > If yes, can you collect debugfs log after the timeout is triggered?
> > > 
> > > Same limitation as before - once SCSI timeout happens, SCSI error handling
> > > kicks in and the shost no longer accepts commands, and, since that same
> > > shost provides rootfs, becomes unresponsive. But I can try.
> > 
> > Just wondering why not install two disks in your test machine, :-)
> 
> The shost becomes unresponsive for all disks. So I could try nfs, but I'm
> not a fan :)

Then it will take you extra effort to collect logs, and an NFS root
should have been quite easy to set up, :-)


Thanks,
Ming
Ming Lei June 4, 2020, 1:28 p.m. UTC | #12
On Thu, Jun 4, 2020 at 8:50 PM John Garry <john.garry@huawei.com> wrote:
>
>
> >> That's your patch - ok, I can try.
> >>
>
> I still get timeouts and sometimes the same driver tag message occurs:
>
>   1014.232417] run queue from wrong CPU 0, hctx active
> [ 1014.237692] run queue from wrong CPU 0, hctx active
> [ 1014.243014] run queue from wrong CPU 0, hctx active
> [ 1014.248370] run queue from wrong CPU 0, hctx active
> [ 1014.253725] run queue from wrong CPU 0, hctx active
> [ 1014.259252] run queue from wrong CPU 0, hctx active
> [ 1014.264492] run queue from wrong CPU 0, hctx active
> [ 1014.269453] irq_shutdown irq146
> [ 1014.272752] CPU55: shutdown
> [ 1014.275552] psci: CPU55 killed (polled 0 ms)
> [ 1015.151530] CPU56: shutdownr=1621MiB/s,w=0KiB/s][r=415k,w=0 IOPS][eta
> 00m:00s]
> [ 1015.154322] psci: CPU56 killed (polled 0 ms)
> [ 1015.184345] CPU57: shutdown
> [ 1015.187143] psci: CPU57 killed (polled 0 ms)
> [ 1015.223388] CPU58: shutdown
> [ 1015.226174] psci: CPU58 killed (polled 0 ms)
> long sleep 8
> [ 1045.234781] scsi_times_out req=0xffff041fa13e6300[r=0,w=0 IOPS][eta
> 04m:30s]
>
> [...]
>
> >>
> >> I thought that if all the sched tags are put, then we should have no driver
> >> tag for that same hctx, right? That seems to coincide with the timeout (30
> >> seconds later)
> >
> > That is weird, if there is driver tag found, that means the request is
> > in-flight and can't be completed by HW.
>
> In blk_mq_hctx_has_requests(), we iterate the sched tags (when
> hctx->sched_tags is set). So can some requests not have a sched tag
> (even for scheduler set for the queue)?
>
>   I assume you have integrated
> > global host tags patch in your test,
>
> No, but the LLDD does not use request->tag - it generates its own.

Apart from a wrong queue mapping, another possible reason is that the
generated tag may not be unique. Either of the two may cause such a
timeout issue when the managed interrupt is active.

Thanks,
Ming
John Garry June 4, 2020, 2:11 p.m. UTC | #13
> 
> It isn't related with request->tag, what I meant is that you use
> out-of-tree patch to enable multiple hw queue on hisi_sas, you have
> to make the queue mapping correct, that said the exact queue mapping
> from blk-mq's mapping has to be used, which is built from managed
> interrupt affinity.
> 
> Please collect the following log:
> 
> 1) ./dump-io-irq-affinity $PCI_ID_OF_HBA
> http://people.redhat.com/minlei/tests/tools/dump-io-irq-affinity
> 

I had to hack this a bit for SAS HBA:

kernel version:
Linux ubuntu 5.7.0-next-20200603-16498-gbfbfcda762d5 #405 SMP PREEMPT 
Thu Jun 4 14:19:49 BST 2020 aarch64 aarch64 aarch64 GNU/Linux
-e 	irq 137, cpu list 16-19, effective list 16
-e 	irq 138, cpu list 20-23, effective list 20
-e 	irq 139, cpu list 24-27, effective list 24
-e 	irq 140, cpu list 28-31, effective list 28
-e 	irq 141, cpu list 32-35, effective list 32
-e 	irq 142, cpu list 36-39, effective list 36
-e 	irq 143, cpu list 40-43, effective list 40
-e 	irq 144, cpu list 44-47, effective list 44
-e 	irq 145, cpu list 48-51, effective list 48
-e 	irq 146, cpu list 52-55, effective list 52
-e 	irq 147, cpu list 56-59, effective list 56
-e 	irq 148, cpu list 60-63, effective list 60
-e 	irq 149, cpu list 0-3, effective list 0
-e 	irq 150, cpu list 4-7, effective list 4
-e 	irq 151, cpu list 8-11, effective list 8
-e 	irq 152, cpu list 12-15, effective list 12

> 2) ./dump-qmap /dev/sdN
> http://people.redhat.com/minlei/tests/tools/dump-qmap


queue mapping for /dev/sda
	hctx0: default 16 17 18 19
	hctx1: default 20 21 22 23
	hctx2: default 24 25 26 27
	hctx3: default 28 29 30 31
	hctx4: default 32 33 34 35
	hctx5: default 36 37 38 39
	hctx6: default 40 41 42 43
	hctx7: default 44 45 46 47
	hctx8: default 48 49 50 51
	hctx9: default 52 53 54 55
	hctx10: default 56 57 58 59
	hctx11: default 60 61 62 63
	hctx12: default 0 1 2 3
	hctx13: default 4 5 6 7
	hctx14: default 8 9 10 11
	hctx15: default 12 13 14 15
queue mapping for /dev/sdb
	hctx0: default 16 17 18 19
	hctx1: default 20 21 22 23
	hctx2: default 24 25 26 27
	hctx3: default 28 29 30 31
	hctx4: default 32 33 34 35
	hctx5: default 36 37 38 39
	hctx6: default 40 41 42 43
	hctx7: default 44 45 46 47
	hctx8: default 48 49 50 51
	hctx9: default 52 53 54 55
	hctx10: default 56 57 58 59
	hctx11: default 60 61 62 63
	hctx12: default 0 1 2 3
	hctx13: default 4 5 6 7
	hctx14: default 8 9 10 11
	hctx15: default 12 13 14 15
queue mapping for /dev/sdc
	hctx0: default 16 17 18 19
	hctx1: default 20 21 22 23
	hctx2: default 24 25 26 27
	hctx3: default 28 29 30 31
	hctx4: default 32 33 34 35
	hctx5: default 36 37 38 39
	hctx6: default 40 41 42 43
	hctx7: default 44 45 46 47
	hctx8: default 48 49 50 51
	hctx9: default 52 53 54 55
	hctx10: default 56 57 58 59
	hctx11: default 60 61 62 63
	hctx12: default 0 1 2 3
	hctx13: default 4 5 6 7
	hctx14: default 8 9 10 11
	hctx15: default 12 13 14 15
queue mapping for /dev/sdd
	hctx0: default 16 17 18 19
	hctx1: default 20 21 22 23
	hctx2: default 24 25 26 27
	hctx3: default 28 29 30 31
	hctx4: default 32 33 34 35
	hctx5: default 36 37 38 39
	hctx6: default 40 41 42 43
	hctx7: default 44 45 46 47
	hctx8: default 48 49 50 51
	hctx9: default 52 53 54 55
	hctx10: default 56 57 58 59
	hctx11: default 60 61 62 63
	hctx12: default 0 1 2 3
	hctx13: default 4 5 6 7
	hctx14: default 8 9 10 11
	hctx15: default 12 13 14 15
queue mapping for /dev/sde
	hctx0: default 16 17 18 19
	hctx1: default 20 21 22 23
	hctx2: default 24 25 26 27
	hctx3: default 28 29 30 31
	hctx4: default 32 33 34 35
	hctx5: default 36 37 38 39
	hctx6: default 40 41 42 43
	hctx7: default 44 45 46 47
	hctx8: default 48 49 50 51
	hctx9: default 52 53 54 55
	hctx10: default 56 57 58 59
	hctx11: default 60 61 62 63
	hctx12: default 0 1 2 3
	hctx13: default 4 5 6 7
	hctx14: default 8 9 10 11
	hctx15: default 12 13 14 15
queue mapping for /dev/sdf
	hctx0: default 16 17 18 19
	hctx1: default 20 21 22 23
	hctx2: default 24 25 26 27
	hctx3: default 28 29 30 31
	hctx4: default 32 33 34 35
	hctx5: default 36 37 38 39
	hctx6: default 40 41 42 43
	hctx7: default 44 45 46 47
	hctx8: default 48 49 50 51
	hctx9: default 52 53 54 55
	hctx10: default 56 57 58 59
	hctx11: default 60 61 62 63
	hctx12: default 0 1 2 3
	hctx13: default 4 5 6 7
	hctx14: default 8 9 10 11
	hctx15: default 12 13 14 15
queue mapping for /dev/sdg
	hctx0: default 16 17 18 19
	hctx1: default 20 21 22 23
	hctx2: default 24 25 26 27
	hctx3: default 28 29 30 31
	hctx4: default 32 33 34 35
	hctx5: default 36 37 38 39
	hctx6: default 40 41 42 43
	hctx7: default 44 45 46 47
	hctx8: default 48 49 50 51
	hctx9: default 52 53 54 55
	hctx10: default 56 57 58 59
	hctx11: default 60 61 62 63
	hctx12: default 0 1 2 3
	hctx13: default 4 5 6 7
	hctx14: default 8 9 10 11
	hctx15: default 12 13 14 15
queue mapping for /dev/sdh
	hctx0: default 16 17 18 19
	hctx1: default 20 21 22 23
	hctx2: default 24 25 26 27
	hctx3: default 28 29 30 31
	hctx4: default 32 33 34 35
	hctx5: default 36 37 38 39
	hctx6: default 40 41 42 43
	hctx7: default 44 45 46 47
	hctx8: default 48 49 50 51
	hctx9: default 52 53 54 55
	hctx10: default 56 57 58 59
	hctx11: default 60 61 62 63
	hctx12: default 0 1 2 3
	hctx13: default 4 5 6 7
	hctx14: default 8 9 10 11
	hctx15: default 12 13 14 15

> 
> 
>>
>>   and suggest you to double check
>>> hisi_sas's queue mapping which has to be exactly same with blk-mq's
>>> mapping.
>>>
>>
>> scheduler=none is ok, so I am skeptical of a problem there.
>>
>>>>
>>>>>
>>>>> If yes, can you collect debugfs log after the timeout is triggered?
>>>>
>>>> Same limitation as before - once SCSI timeout happens, SCSI error handling
>>>> kicks in and the shost no longer accepts commands, and, since that same
>>>> shost provides rootfs, becomes unresponsive. But I can try.
>>>
>>> Just wondering why not install two disks in your test machine, :-)
>>
>> The shost becomes unresponsive for all disks. So I could try nfs, but I'm
>> not a fan :)
> 
> Then it will take you extra effort in collecting log, and NFS root
> should have been quite easy to setup, :-)
> 

Should be ...

 >> No, but the LLDD does not use request->tag - it generates its own.
 >
 > Except for wrong queue mapping,  another reason is that the generated
 > tag may not
 > be unique. Either of two may cause such timeout issue when the managed
 > interrupt is
 > active.
 >

Right, but the tag should be unique - it needs to be in the LLDD.

Anyway, I'll continue to debug.

BTW, I'm using linux-next 0306 as baseline. I don't like using next, but 
Linus' master branch yesterday was crashing while booting for me. I need 
to check that again for where master is now.

Thanks
Ming Lei June 5, 2020, 12:59 a.m. UTC | #14
On Thu, Jun 04, 2020 at 03:11:39PM +0100, John Garry wrote:
> > 
> > It isn't related with request->tag, what I meant is that you use
> > out-of-tree patch to enable multiple hw queue on hisi_sas, you have
> > to make the queue mapping correct, that said the exact queue mapping
> > from blk-mq's mapping has to be used, which is built from managed
> > interrupt affinity.
> > 
> > Please collect the following log:
> > 
> > 1) ./dump-io-irq-affinity $PCI_ID_OF_HBA
> > http://people.redhat.com/minlei/tests/tools/dump-io-irq-affinity
> > 
> 
> I had to hack this a bit for SAS HBA:
> 
> kernel version:
> Linux ubuntu 5.7.0-next-20200603-16498-gbfbfcda762d5 #405 SMP PREEMPT Thu
> Jun 4 14:19:49 BST 2020 aarch64 aarch64 aarch64 GNU/Linux
> -e 	irq 137, cpu list 16-19, effective list 16
> -e 	irq 138, cpu list 20-23, effective list 20
> -e 	irq 139, cpu list 24-27, effective list 24
> -e 	irq 140, cpu list 28-31, effective list 28
> -e 	irq 141, cpu list 32-35, effective list 32
> -e 	irq 142, cpu list 36-39, effective list 36
> -e 	irq 143, cpu list 40-43, effective list 40
> -e 	irq 144, cpu list 44-47, effective list 44
> -e 	irq 145, cpu list 48-51, effective list 48
> -e 	irq 146, cpu list 52-55, effective list 52
> -e 	irq 147, cpu list 56-59, effective list 56
> -e 	irq 148, cpu list 60-63, effective list 60
> -e 	irq 149, cpu list 0-3, effective list 0
> -e 	irq 150, cpu list 4-7, effective list 4
> -e 	irq 151, cpu list 8-11, effective list 8
> -e 	irq 152, cpu list 12-15, effective list 12
> 
> > 2) ./dump-qmap /dev/sdN
> > http://people.redhat.com/minlei/tests/tools/dump-qmap
> 
> 
> queue mapping for /dev/sda
> 	hctx0: default 16 17 18 19
> 	hctx1: default 20 21 22 23
> 	hctx2: default 24 25 26 27
> 	hctx3: default 28 29 30 31
> 	hctx4: default 32 33 34 35
> 	hctx5: default 36 37 38 39
> 	hctx6: default 40 41 42 43
> 	hctx7: default 44 45 46 47
> 	hctx8: default 48 49 50 51
> 	hctx9: default 52 53 54 55
> 	hctx10: default 56 57 58 59
> 	hctx11: default 60 61 62 63
> 	hctx12: default 0 1 2 3
> 	hctx13: default 4 5 6 7
> 	hctx14: default 8 9 10 11
> 	hctx15: default 12 13 14 15

OK, the queue mapping is correct.

As I mentioned in another thread, the real hw tag may be set as wrong.

You have to double check your cooked tag allocation algorithm and see if it
can work well when more requests than real hw queue depth are queued to hisi_sas,
and the correct way is to return SCSI_MLQUEUE_HOST_BUSY from .queuecommand().

Thanks,
Ming
John Garry June 5, 2020, 7:24 a.m. UTC | #15
On 05/06/2020 01:59, Ming Lei wrote:
>> 	hctx5: default 36 37 38 39
>> 	hctx6: default 40 41 42 43
>> 	hctx7: default 44 45 46 47
>> 	hctx8: default 48 49 50 51
>> 	hctx9: default 52 53 54 55
>> 	hctx10: default 56 57 58 59
>> 	hctx11: default 60 61 62 63
>> 	hctx12: default 0 1 2 3
>> 	hctx13: default 4 5 6 7
>> 	hctx14: default 8 9 10 11
>> 	hctx15: default 12 13 14 15
> OK, the queue mapping is correct.
> 
> As I mentioned in another thread, the real hw tag may be set as wrong.
> 

I doubt this.

And I think that you should also be able to add the same debug to 
blk_mq_hctx_notify_offline() to see that there may be still driver tags 
allocated to when all the scheduler tags are free'd for any test in your 
env.

> You have to double check your cooked tag allocation algorithm and see if it
> can work well when more requests than real hw queue depth are queued to hisi_sas,
> and the correct way is to return SCSI_MLQUEUE_HOST_BUSY from .queuecommand().

Yeah, the LLDD would just reject requests in that scenario and we would 
know about it from logs etc.

Anyway, I'll continue to check.

Thanks,
John
Ming Lei June 5, 2020, 7:31 a.m. UTC | #16
On Fri, Jun 05, 2020 at 08:24:50AM +0100, John Garry wrote:
> On 05/06/2020 01:59, Ming Lei wrote:
> > > 	hctx5: default 36 37 38 39
> > > 	hctx6: default 40 41 42 43
> > > 	hctx7: default 44 45 46 47
> > > 	hctx8: default 48 49 50 51
> > > 	hctx9: default 52 53 54 55
> > > 	hctx10: default 56 57 58 59
> > > 	hctx11: default 60 61 62 63
> > > 	hctx12: default 0 1 2 3
> > > 	hctx13: default 4 5 6 7
> > > 	hctx14: default 8 9 10 11
> > > 	hctx15: default 12 13 14 15
> > OK, the queue mapping is correct.
> > 
> > As I mentioned in another thread, the real hw tag may be set as wrong.
> > 
> 
> I doubt this.
> 
> And I think that you should also be able to add the same debug to
> blk_mq_hctx_notify_offline() to see that there may be still driver tags
> allocated to when all the scheduler tags are free'd for any test in your
> env.

No, that isn't possible, scheduler tag lifetime covers the whole request's
lifetime.

> 
> > You have to double check your cooked tag allocation algorithm and see if it
> > can work well when more requests than real hw queue depth are queued to hisi_sas,
> > and the correct way is to return SCSI_MLQUEUE_HOST_BUSY from .queuecommand().
> 
> Yeah, the LLDD would just reject requests in that scenario and we would know
> about it from logs etc.
> 
> Anyway, I'll continue to check.

The merged patch is much simpler than before, new request is prevented from being
allocated on the inactive hctx, then drain all in-flight requests on this hctx.

You need to check if the request is queued to hw correctly.

Thanks,
Ming
Ming Lei June 5, 2020, 8:33 a.m. UTC | #17
On Thu, Jun 04, 2020 at 01:45:09PM +0100, John Garry wrote:
> 
> > > That's your patch - ok, I can try.
> > > 
> 
> I still get timeouts and sometimes the same driver tag message occurs:
> 
>  [ 1014.232417] run queue from wrong CPU 0, hctx active
> [ 1014.237692] run queue from wrong CPU 0, hctx active
> [ 1014.243014] run queue from wrong CPU 0, hctx active
> [ 1014.248370] run queue from wrong CPU 0, hctx active
> [ 1014.253725] run queue from wrong CPU 0, hctx active
> [ 1014.259252] run queue from wrong CPU 0, hctx active
> [ 1014.264492] run queue from wrong CPU 0, hctx active
> [ 1014.269453] irq_shutdown irq146
> [ 1014.272752] CPU55: shutdown
> [ 1014.275552] psci: CPU55 killed (polled 0 ms)
> [ 1015.151530] CPU56: shutdownr=1621MiB/s,w=0KiB/s][r=415k,w=0 IOPS][eta
> 00m:00s]
> [ 1015.154322] psci: CPU56 killed (polled 0 ms)
> [ 1015.184345] CPU57: shutdown
> [ 1015.187143] psci: CPU57 killed (polled 0 ms)
> [ 1015.223388] CPU58: shutdown
> [ 1015.226174] psci: CPU58 killed (polled 0 ms)
> long sleep 8
> [ 1045.234781] scsi_times_out req=0xffff041fa13e6300[r=0,w=0 IOPS][eta
> 04m:30s]
> 
> [...]
> 
> > > 
> > > I thought that if all the sched tags are put, then we should have no driver
> > > tag for that same hctx, right? That seems to coincide with the timeout (30
> > > seconds later)
> > 
> > That is weird, if there is driver tag found, that means the request is
> > in-flight and can't be completed by HW.
> 
> In blk_mq_hctx_has_requests(), we iterate the sched tags (when
> hctx->sched_tags is set). So can some requests not have a sched tag (even
> for scheduler set for the queue)?
> 
>  I assume you have integrated
> > global host tags patch in your test,
> 
> No, but the LLDD does not use request->tag - it generates its own.
> 
>  and suggest you to double check
> > hisi_sas's queue mapping which has to be exactly same with blk-mq's
> > mapping.
> > 
> 
> scheduler=none is ok, so I am skeptical of a problem there.

Please try the following patch — we may not be draining in-flight
requests correctly:

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 97bb650f0ed6..ae110e2754bf 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -265,6 +265,7 @@ struct bt_tags_iter_data {
 
 #define BT_TAG_ITER_RESERVED		(1 << 0)
 #define BT_TAG_ITER_STARTED		(1 << 1)
+#define BT_TAG_ITER_STATIC_RQS		(1 << 2)
 
 static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 {
@@ -280,7 +281,10 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assining ->rqs[].
 	 */
-	rq = tags->rqs[bitnr];
+	if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
+		rq = tags->static_rqs[bitnr];
+	else
+		rq = tags->rqs[bitnr];
 	if (!rq)
 		return true;
 	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
@@ -335,11 +339,13 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
  *		indicates whether or not @rq is a reserved request. Return
  *		true to continue iterating tags, false to stop.
  * @priv:	Will be passed as second argument to @fn.
+ *
+ * Caller has to pass the tag map from which requests are allocated.
  */
 void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 		void *priv)
 {
-	return __blk_mq_all_tag_iter(tags, fn, priv, 0);
+	return __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
 }
 
 /**


Thanks,
Ming
John Garry June 5, 2020, 9:27 a.m. UTC | #18
On 05/06/2020 09:33, Ming Lei wrote:
>> LLDD does not use request->tag - it generates its own.
>>
>>   and suggest you to double check
>>> hisi_sas's queue mapping which has to be exactly same with blk-mq's
>>> mapping.
>>>
>> scheduler=none is ok, so I am skeptical of a problem there.
> Please try the following patch, and we may not drain in-flight
> requests correctly:
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 97bb650f0ed6..ae110e2754bf 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -265,6 +265,7 @@ struct bt_tags_iter_data {
>   
>   #define BT_TAG_ITER_RESERVED		(1 << 0)
>   #define BT_TAG_ITER_STARTED		(1 << 1)
> +#define BT_TAG_ITER_STATIC_RQS		(1 << 2)
>   
>   static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>   {
> @@ -280,7 +281,10 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>   	 * We can hit rq == NULL here, because the tagging functions
>   	 * test and set the bit before assining ->rqs[].
>   	 */
> -	rq = tags->rqs[bitnr];
> +	if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
> +		rq = tags->static_rqs[bitnr];
> +	else
> +		rq = tags->rqs[bitnr];
>   	if (!rq)
>   		return true;
>   	if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
> @@ -335,11 +339,13 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
>    *		indicates whether or not @rq is a reserved request. Return
>    *		true to continue iterating tags, false to stop.
>    * @priv:	Will be passed as second argument to @fn.
> + *
> + * Caller has to pass the tag map from which requests are allocated.
>    */
>   void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
>   		void *priv)
>   {
> -	return __blk_mq_all_tag_iter(tags, fn, priv, 0);
> +	return __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
>   }
>   
>   /**
> 

ok, so early test shows that this is ok. I didn't try scheduler=none though.

I'll let you know.

Thanks,
John
John Garry June 5, 2020, 11:08 a.m. UTC | #19
On 05/06/2020 10:27, John Garry wrote:
> On 05/06/2020 09:33, Ming Lei wrote:
>>> LLDD does not use request->tag - it generates its own.
>>>
>>>   and suggest you to double check
>>>> hisi_sas's queue mapping which has to be exactly same with blk-mq's
>>>> mapping.
>>>>
>>> scheduler=none is ok, so I am skeptical of a problem there.
>> Please try the following patch, and we may not drain in-flight
>> requests correctly:
>>
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index 97bb650f0ed6..ae110e2754bf 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -265,6 +265,7 @@ struct bt_tags_iter_data {
>>   #define BT_TAG_ITER_RESERVED        (1 << 0)
>>   #define BT_TAG_ITER_STARTED        (1 << 1)
>> +#define BT_TAG_ITER_STATIC_RQS        (1 << 2)
>>   static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, 
>> void *data)
>>   {
>> @@ -280,7 +281,10 @@ static bool bt_tags_iter(struct sbitmap *bitmap, 
>> unsigned int bitnr, void *data)
>>        * We can hit rq == NULL here, because the tagging functions
>>        * test and set the bit before assining ->rqs[].

assigning

>>        */
>> -    rq = tags->rqs[bitnr];
>> +    if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
>> +        rq = tags->static_rqs[bitnr];
>> +    else
>> +        rq = tags->rqs[bitnr];
>>       if (!rq)
>>           return true;
>>       if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
>> @@ -335,11 +339,13 @@ static void __blk_mq_all_tag_iter(struct 
>> blk_mq_tags *tags,
>>    *        indicates whether or not @rq is a reserved request. Return
>>    *        true to continue iterating tags, false to stop.
>>    * @priv:    Will be passed as second argument to @fn.
>> + *
>> + * Caller has to pass the tag map from which requests are allocated.
>>    */
>>   void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn 
>> *fn,
>>           void *priv)
>>   {
>> -    return __blk_mq_all_tag_iter(tags, fn, priv, 0);
>> +    return __blk_mq_all_tag_iter(tags, fn, priv, 
>> BT_TAG_ITER_STATIC_RQS);
>>   }
>>   /**
>>
> 
> ok, so early test shows that this is ok. I didn't try scheduler=none 
> though.
> 

So that looks ok for scheduler=none also.

So can we please get both patches sent formally? (I was using 
Christoph's patch, which introduces__blk_mq_get_driver_tag()).

Cheers,
John

Ps. if sending a series, can you also fix up the spelling mistake? I 
don't think it's worth sending a single patch for that, which may conflict.

Patch
diff mbox series

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 96a39d0724a2..762198b62088 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -180,14 +180,6 @@  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
-	/*
-	 * Give up this allocation if the hctx is inactive.  The caller will
-	 * retry on an active hctx.
-	 */
-	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
-		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
-		return BLK_MQ_NO_TAG;
-	}
 	return tag + tag_offset;
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a98a19353461..c5acf4858abf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -347,6 +347,24 @@  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	return rq;
 }
 
+static inline unsigned int blk_mq_get_request_tag(
+		struct blk_mq_alloc_data *data)
+{
+	/*
+	 * Waiting allocations only fail because of an inactive hctx.  In that
+	 * case just retry the hctx assignment and tag allocation as CPU hotplug
+	 * should have migrated us to an online CPU by now.
+	 */
+	int tag = blk_mq_get_tag(data);
+	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state) &&
+				tag != BLK_MQ_NO_TAG)) {
+		blk_mq_put_tag(blk_mq_tags_from_data(data), data->ctx, tag);
+		tag = BLK_MQ_NO_TAG;
+	}
+
+	return tag;
+}
+
 static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 {
 	struct request_queue *q = data->q;
@@ -381,12 +399,7 @@  static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 	if (!(data->flags & BLK_MQ_REQ_INTERNAL))
 		blk_mq_tag_busy(data->hctx);
 
-	/*
-	 * Waiting allocations only fail because of an inactive hctx.  In that
-	 * case just retry the hctx assignment and tag allocation as CPU hotplug
-	 * should have migrated us to an online CPU by now.
-	 */
-	tag = blk_mq_get_tag(data);
+	tag = blk_mq_get_request_tag(data);
 	if (tag == BLK_MQ_NO_TAG) {
 		if (data->flags & BLK_MQ_REQ_NOWAIT)
 			return NULL;
@@ -480,7 +493,7 @@  struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		blk_mq_tag_busy(data.hctx);
 
 	ret = -EWOULDBLOCK;
-	tag = blk_mq_get_tag(&data);
+	tag = blk_mq_get_request_tag(&data);
 	if (tag == BLK_MQ_NO_TAG)
 		goto out_queue_exit;
 	return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);