[14/14] nvme: use blk_mq_[un]quiesce_tagset

Message ID 20221101150050.3510-15-hch@lst.de (mailing list archive)
State New, archived
Series [01/14] block: set the disk capacity to 0 in blk_mark_disk_dead

Commit Message

Christoph Hellwig Nov. 1, 2022, 3 p.m. UTC
From: Chao Leng <lengchao@huawei.com>

All controller namespaces share the same tagset, so we can use this
interface, which performs the optimal parallel quiesce operation based
on the tagset type (e.g. blocking vs. non-blocking tagsets).

The nvme connect_q should not be quiesced when the tagset is quiesced,
so set QUEUE_FLAG_SKIP_TAGSET_QUIESCE on it when initializing connect_q.

Currently we use NVME_NS_STOPPED to ensure that quiescing and
unquiescing are paired. With blk_mq_[un]quiesce_tagset, NVME_NS_STOPPED
no longer applies, so introduce NVME_CTRL_STOPPED to replace it. We
never really quiesce a single namespace anyway, so moving the flag from
the namespace to the controller is the better choice.

Signed-off-by: Chao Leng <lengchao@huawei.com>
[hch: rebased on top of prep patches]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Chao Leng <lengchao@huawei.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 34 ++++++++--------------------------
 drivers/nvme/host/nvme.h |  2 +-
 2 files changed, 9 insertions(+), 27 deletions(-)
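
For context, the tagset-wide quiesce helpers this patch switches to were
added on the block layer side earlier in this series. They look roughly
like this (a lightly abridged sketch, not the exact hunks of that patch):

void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
{
	struct request_queue *q;

	mutex_lock(&set->tag_list_lock);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		/* queues like nvme's connect_q opt out via the new flag */
		if (!blk_queue_skip_tagset_quiesce(q))
			blk_mq_quiesce_queue_nowait(q);
	}
	/* one (S)RCU grace period for the whole tagset, not one per queue */
	blk_mq_wait_quiesce_done(set);
	mutex_unlock(&set->tag_list_lock);
}

void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
{
	struct request_queue *q;

	mutex_lock(&set->tag_list_lock);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		if (!blk_queue_skip_tagset_quiesce(q))
			blk_mq_unquiesce_queue(q);
	}
	mutex_unlock(&set->tag_list_lock);
}

This is where the "optimal parallel quiesce" comes from: quiesce is
started on every queue without waiting, and a single
blk_mq_wait_quiesce_done() call then waits once for the whole tagset.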

Comments

Chaitanya Kulkarni Nov. 2, 2022, 5:51 a.m. UTC | #1
On 11/1/22 08:00, Christoph Hellwig wrote:
> From: Chao Leng <lengchao@huawei.com>
> 
> All controller namespaces share the same tagset, so we can use this
> interface, which performs the optimal parallel quiesce operation based
> on the tagset type (e.g. blocking vs. non-blocking tagsets).
> 
> The nvme connect_q should not be quiesced when the tagset is quiesced,
> so set QUEUE_FLAG_SKIP_TAGSET_QUIESCE on it when initializing connect_q.
> 
> Currently we use NVME_NS_STOPPED to ensure that quiescing and
> unquiescing are paired. With blk_mq_[un]quiesce_tagset, NVME_NS_STOPPED
> no longer applies, so introduce NVME_CTRL_STOPPED to replace it. We
> never really quiesce a single namespace anyway, so moving the flag from
> the namespace to the controller is the better choice.
> 
> Signed-off-by: Chao Leng <lengchao@huawei.com>
> [hch: rebased on top of prep patches]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Keith Busch <kbusch@kernel.org>
> Reviewed-by: Chao Leng <lengchao@huawei.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>
> Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
> ---

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck
Shakeel Butt Nov. 21, 2022, 8:44 p.m. UTC | #2
On Tue, Nov 01, 2022 at 04:00:50PM +0100, Christoph Hellwig wrote:
> From: Chao Leng <lengchao@huawei.com>
> 
> All controller namespaces share the same tagset, so we can use this
> interface, which performs the optimal parallel quiesce operation based
> on the tagset type (e.g. blocking vs. non-blocking tagsets).
> 
> The nvme connect_q should not be quiesced when the tagset is quiesced,
> so set QUEUE_FLAG_SKIP_TAGSET_QUIESCE on it when initializing connect_q.
> 
> Currently we use NVME_NS_STOPPED to ensure that quiescing and
> unquiescing are paired. With blk_mq_[un]quiesce_tagset, NVME_NS_STOPPED
> no longer applies, so introduce NVME_CTRL_STOPPED to replace it. We
> never really quiesce a single namespace anyway, so moving the flag from
> the namespace to the controller is the better choice.
> 
> Signed-off-by: Chao Leng <lengchao@huawei.com>
> [hch: rebased on top of prep patches]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Keith Busch <kbusch@kernel.org>
> Reviewed-by: Chao Leng <lengchao@huawei.com>
> Reviewed-by: Hannes Reinecke <hare@suse.de>
> Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

This patch is causing the following crash at boot, and reverting it
fixes the issue. This is the next-20221121 kernel.

[   19.781883] BUG: kernel NULL pointer dereference, address: 0000000000000078
[   19.788761] #PF: supervisor write access in kernel mode
[   19.793924] #PF: error_code(0x0002) - not-present page
[   19.799006] PGD 0 P4D 0
[   19.801510] Oops: 0002 [#1] SMP PTI
[   19.804955] CPU: 54 PID: 110 Comm: kworker/u146:0 Tainted: G          I        6.1.0-next-20221121-smp-DEV #1
[   19.814753] Hardware name: ...
[   19.822662] Workqueue: nvme-reset-wq nvme_reset_work
[   19.827571] RIP: 0010:mutex_lock+0x13/0x30
[   19.831617] Code: 07 00 00 f7 c1 00 01 00 00 75 02 31 c0 5b 5d c3 00 00 cc cc 00 00 cc 0f 1f 44 00 00 55 48 89 e5 65 48 8b 0d 7f 9a c2 52 31 c0 <f0> 48 0f b1 0f 74 05 e8 11 00 00 00 5d c3 66 66 66 66 66 66 2e 0f
[   19.850172] RSP: 0000:ffff96624651bcb0 EFLAGS: 00010246
[   19.855339] RAX: 0000000000000000 RBX: 0000000000000003 RCX: ffff966246530000
[   19.862388] RDX: 0000000000000000 RSI: 0000000000000286 RDI: 0000000000000078
[   19.869442] RBP: ffff96624651bcb0 R08: 0000000000000004 R09: ffff96624651bc74
[   19.876495] R10: 0000000000000000 R11: 0000000000000000 R12: ffff966246570000
[   19.883547] R13: ffff966246570000 R14: ffff966249f330b0 R15: 0000000000000000
[   19.890599] FS:  0000000000000000(0000) GS:ffff9681bf880000(0000) knlGS:0000000000000000
[   19.898594] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   19.904273] CR2: 0000000000000078 CR3: 0000003c40e0a001 CR4: 00000000001706e0
[   19.911323] Call Trace:
[   19.913743]  <TASK>
[   19.915818]  blk_mq_quiesce_tagset+0x23/0xc0
[   19.920039]  nvme_stop_queues+0x1e/0x30
[   19.923829]  nvme_dev_disable+0xe1/0x3a0
[   19.927702]  nvme_reset_work+0x14cb/0x1600
[   19.931751]  ? ttwu_queue_wakelist+0xc4/0xd0
[   19.935970]  ? try_to_wake_up+0x1b4/0x2d0
[   19.939933]  process_one_work+0x1a4/0x320
[   19.943895]  worker_thread+0x241/0x3f0
[   19.947597]  ? worker_clr_flags+0x50/0x50
[   19.951560]  kthread+0x10d/0x120
[   19.954748]  ? kthread_blkcg+0x30/0x30
[   19.958453]  ret_from_fork+0x1f/0x30
[   19.961987]  </TASK>
[   19.964145] Modules linked in:
[   19.967655] gsmi: Log Shutdown Reason 0x03
[   19.971700] CR2: 0000000000000078
[   19.974978] ---[ end trace 0000000000000000 ]---
[   20.513708] RIP: 0010:mutex_lock+0x13/0x30
[   20.517757] Code: 07 00 00 f7 c1 00 01 00 00 75 02 31 c0 5b 5d c3 00 00 cc cc 00 00 cc 0f 1f 44 00 00 55 48 89 e5 65 48 8b 0d 7f 9a c2 52 31 c0 <f0> 48 0f b1 0f 74 05 e8 11 00 00 00 5d c3 66 66 66 66 66 66 2e 0f
[   20.536308] RSP: 0000:ffff96624651bcb0 EFLAGS: 00010246
[   20.541469] RAX: 0000000000000000 RBX: 0000000000000003 RCX: ffff966246530000
[   20.548519] RDX: 0000000000000000 RSI: 0000000000000286 RDI: 0000000000000078
[   20.555568] RBP: ffff96624651bcb0 R08: 0000000000000004 R09: ffff96624651bc74
[   20.562614] R10: 0000000000000000 R11: 0000000000000000 R12: ffff966246570000
[   20.569663] R13: ffff966246570000 R14: ffff966249f330b0 R15: 0000000000000000
[   20.576713] FS:  0000000000000000(0000) GS:ffff9681bf880000(0000) knlGS:0000000000000000
[   20.584704] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   20.590380] CR2: 0000000000000078 CR3: 0000003c40e0a001 CR4: 00000000001706e0
[   20.597432] Kernel panic - not syncing: Fatal exception
[   20.602654] Kernel Offset: 0x2b800000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[   20.613747] gsmi: Log Shutdown Reason 0x02
Chao Leng Nov. 22, 2022, 2:53 a.m. UTC | #3
On 2022/11/22 4:44, Shakeel Butt wrote:
> On Tue, Nov 01, 2022 at 04:00:50PM +0100, Christoph Hellwig wrote:
>> From: Chao Leng <lengchao@huawei.com>
>>
>> All controller namespaces share the same tagset, so we can use this
>> interface, which performs the optimal parallel quiesce operation based
>> on the tagset type (e.g. blocking vs. non-blocking tagsets).
>>
>> The nvme connect_q should not be quiesced when the tagset is quiesced,
>> so set QUEUE_FLAG_SKIP_TAGSET_QUIESCE on it when initializing connect_q.
>>
>> Currently we use NVME_NS_STOPPED to ensure that quiescing and
>> unquiescing are paired. With blk_mq_[un]quiesce_tagset, NVME_NS_STOPPED
>> no longer applies, so introduce NVME_CTRL_STOPPED to replace it. We
>> never really quiesce a single namespace anyway, so moving the flag from
>> the namespace to the controller is the better choice.
>>
>> Signed-off-by: Chao Leng <lengchao@huawei.com>
>> [hch: rebased on top of prep patches]
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>> Reviewed-by: Keith Busch <kbusch@kernel.org>
>> Reviewed-by: Chao Leng <lengchao@huawei.com>
>> Reviewed-by: Hannes Reinecke <hare@suse.de>
>> Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
> 
> This patch is causing the following crash at boot, and reverting it
> fixes the issue. This is the next-20221121 kernel.
This patch can fix it.
https://lore.kernel.org/linux-nvme/20221116072711.1903536-1-hch@lst.de/
Christoph Hellwig Nov. 22, 2022, 6:08 a.m. UTC | #4
On Tue, Nov 22, 2022 at 10:53:17AM +0800, Chao Leng wrote:
>> This patch is causing the following crash at boot, and reverting it
>> fixes the issue. This is the next-20221121 kernel.
> This patch can fix it.
> https://lore.kernel.org/linux-nvme/20221116072711.1903536-1-hch@lst.de/

Yes, but I'm a little curious how it happened.  Shakeel, do you
have a genuine admin controller that does not have any I/O queues
to start with, or did we have other setup time errors?  Can you post
the full dmesg?
Sagi Grimberg Nov. 22, 2022, 9:44 a.m. UTC | #5
>>> This patch is causing the following crash at boot, and reverting it
>>> fixes the issue. This is the next-20221121 kernel.
>> This patch can fix it.
>> https://lore.kernel.org/linux-nvme/20221116072711.1903536-1-hch@lst.de/
> 
> Yes, but I'm a little curious how it happened.  Shakeel, do you
> have a genuine admin controller that does not have any I/O queues
> to start with, or did we have other setup time errors?  Can you post
> the full dmesg?

Is this after splitting reset from probe?

If not, I think that probe schedules nvme_reset, which first
disables the controller, calling nvme_stop_queues unconditionally,
i.e. before the tagset was allocated.
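
That reading fits the oops above: if ctrl->tagset is still NULL when
nvme_dev_disable() quiesces I/O, the new nvme_stop_queues() hands a NULL
tagset to blk_mq_quiesce_tagset(), whose first step is
mutex_lock(&set->tag_list_lock); that is a mutex_lock() on a small
offset from NULL, which lines up with the faulting address
0000000000000078. A sketch of the suspected path (the call chain is
taken from the trace above; the NULL-tagset assumption is unconfirmed
until the full dmesg is available):

/*
 * Suspected path, assuming the controller never got an I/O tagset
 * (ctrl->tagset == NULL) before the reset work ran:
 *
 *   nvme_reset_work()
 *     nvme_dev_disable()
 *       nvme_stop_queues(ctrl)
 *         blk_mq_quiesce_tagset(NULL)
 *           mutex_lock(&set->tag_list_lock)   <- NULL plus the offset of
 *                                                tag_list_lock, ~0x78 here
 */
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
	if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
		blk_mq_quiesce_tagset(ctrl->tagset);	/* no NULL check here */
	else
		blk_mq_wait_quiesce_done(ctrl->tagset);
}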
Shakeel Butt Nov. 22, 2022, 4:40 p.m. UTC | #6
On Mon, Nov 21, 2022 at 10:08 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Tue, Nov 22, 2022 at 10:53:17AM +0800, Chao Leng wrote:
> >> This patch is causing the following crash at boot, and reverting it
> >> fixes the issue. This is the next-20221121 kernel.
> > This patch can fix it.
> > https://lore.kernel.org/linux-nvme/20221116072711.1903536-1-hch@lst.de/
>
> Yes, but I'm a little curious how it happened.  Shakeel, do you
> have a genuine admin controller that does not have any I/O queues
> to start with, or did we have other setup time errors?  Can you post
> the full dmesg?

Sorry, I don't know about the admin controller and its I/O queues. For
dmesg, it was an early crash, so it is not in /var/log/messages. I was
able to get the crash dump through the remote serial console, but there
is no easy way to get the full dmesg from it. I will see if I can get
more info.

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 66b0b6e110024..f94b05c585cbc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4895,6 +4895,8 @@  int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
 			ret = PTR_ERR(ctrl->connect_q);
 			goto out_free_tag_set;
 		}
+		blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
+				   ctrl->connect_q);
 	}
 
 	ctrl->tagset = set;
@@ -5096,20 +5098,6 @@  int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
-static void nvme_start_ns_queue(struct nvme_ns *ns)
-{
-	if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
-		blk_mq_unquiesce_queue(ns->queue);
-}
-
-static void nvme_stop_ns_queue(struct nvme_ns *ns)
-{
-	if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
-		blk_mq_quiesce_queue(ns->queue);
-	else
-		blk_mq_wait_quiesce_done(ns->queue->tag_set);
-}
-
 /* let I/O to all namespaces fail in preparation for surprise removal */
 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
 {
@@ -5172,23 +5160,17 @@  EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ns *ns;
-
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
-		nvme_stop_ns_queue(ns);
-	up_read(&ctrl->namespaces_rwsem);
+	if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
+		blk_mq_quiesce_tagset(ctrl->tagset);
+	else
+		blk_mq_wait_quiesce_done(ctrl->tagset);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
 
 void nvme_start_queues(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ns *ns;
-
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
-		nvme_start_ns_queue(ns);
-	up_read(&ctrl->namespaces_rwsem);
+	if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
+		blk_mq_unquiesce_tagset(ctrl->tagset);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1cd7168002438..f9df10653f3c5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -237,6 +237,7 @@  enum nvme_ctrl_flags {
 	NVME_CTRL_FAILFAST_EXPIRED	= 0,
 	NVME_CTRL_ADMIN_Q_STOPPED	= 1,
 	NVME_CTRL_STARTED_ONCE		= 2,
+	NVME_CTRL_STOPPED		= 3,
 };
 
 struct nvme_ctrl {
@@ -486,7 +487,6 @@  struct nvme_ns {
 #define NVME_NS_ANA_PENDING	2
 #define NVME_NS_FORCE_RO	3
 #define NVME_NS_READY		4
-#define NVME_NS_STOPPED		5
 
 	struct cdev		cdev;
 	struct device		cdev_device;