
[V2,5/5] nvme: pci: simplify timeout handling

Message ID 20180429154152.16656-6-ming.lei@redhat.com (mailing list archive)
State New, archived

Commit Message

Ming Lei April 29, 2018, 3:41 p.m. UTC
When a request times out, nvme_timeout() currently handles it in the
following way:

	nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
	return BLK_EH_HANDLED.

which may introduce the following issues:

1) a later timeout on other requests may call nvme_dev_disable() again,
which may quiesce the queues again while the reset is in progress, so that
eventually nothing can make progress.

2) when a timeout is triggered inside the reset work function,
nvme_wait_freeze() may wait forever because the controller can't be
recovered at all.

This patch fixes the issues by:

1) handling the timeout event in a single EH thread, and waking this
thread up whenever controller recovery is needed

2) inside the EH handler, draining timeout work via nvme_quiesce_timeout()
before canceling in-flight requests, so all requests can be canceled or
completed by either the EH or the block timeout handler

3) moving the draining of I/O and the unfreezing of queues into a separate
post-EH work context, so the controller can always be recovered

This patch fixes the failures reported by the horrible block/011 test.

Cc: Jianchao Wang <jianchao.w.wang@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-nvme@lists.infradead.org
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/nvme/host/core.c |  22 ++++++++
 drivers/nvme/host/nvme.h |   3 +
 drivers/nvme/host/pci.c  | 140 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 155 insertions(+), 10 deletions(-)
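
For orientation, here is a condensed view of the new error-handling flow,
summarized from the patch below (locking and error handling omitted):

/*
 * nvme_timeout()        ->  nvme_eh_schedule(dev)            only wakes the EH thread
 *
 * nvme_error_handler()  ->  nvme_start_freeze(&dev->ctrl)
 *                           nvme_dev_disable(dev, false)     quiesce queues and timeout,
 *                                                            cancel in-flight requests
 *                           nvme_reset_ctrl_sync(&dev->ctrl)
 *                           queue_work(nvme_reset_wq, &dev->post_eh_work)
 *
 * nvme_post_eh_work()   ->  nvme_wait_freeze(&dev->ctrl)     drain I/O outside the EH thread
 *                           nvme_unfreeze(&dev->ctrl)        so new timeouts can still be handled
 */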

Comments

jianchao.wang May 2, 2018, 2:23 a.m. UTC | #1
Hi Ming

On 04/29/2018 11:41 PM, Ming Lei wrote:
> +
>  static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> @@ -1197,8 +1297,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  	 */
>  	if (nvme_should_reset(dev, csts)) {
>  		nvme_warn_reset(dev, csts);
> -		nvme_dev_disable(dev, false);
> -		nvme_reset_ctrl(&dev->ctrl);
> +		nvme_eh_schedule(dev);
>  		return BLK_EH_HANDLED;
>  	}
>  
> @@ -1224,8 +1323,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  		dev_warn(dev->ctrl.device,
>  			 "I/O %d QID %d timeout, disable controller\n",
>  			 req->tag, nvmeq->qid);
> -		nvme_dev_disable(dev, false);
>  		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
> +		nvme_eh_schedule(dev);
>  		return BLK_EH_HANDLED;
>  	default:
>  		break;
> @@ -1240,14 +1339,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  		dev_warn(dev->ctrl.device,
>  			 "I/O %d QID %d timeout, reset controller\n",
>  			 req->tag, nvmeq->qid);
> -		nvme_dev_disable(dev, false);
> -		nvme_reset_ctrl(&dev->ctrl);
> -
>  		/*
>  		 * Mark the request as handled, since the inline shutdown
>  		 * forces all outstanding requests to complete.
>  		 */
>  		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
> +		nvme_eh_schedule(dev);
>  		return BLK_EH_HANDLED;
>  	}
>  
> @@ -2246,8 +2343,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>  	if (pci_is_enabled(pdev)) {
>  		u32 csts = readl(dev->bar + NVME_REG_CSTS);
>  
> -		if (dev->ctrl.state == NVME_CTRL_LIVE ||
> -		    dev->ctrl.state == NVME_CTRL_RESETTING) {
> +		if (shutdown && (dev->ctrl.state == NVME_CTRL_LIVE ||
> +		    dev->ctrl.state == NVME_CTRL_RESETTING)) {
>  			nvme_start_freeze(&dev->ctrl);
>  			frozen = true;
>  		}
> @@ -2281,11 +2378,23 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>  	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
>  		nvme_suspend_queue(&dev->queues[i]);
>  
> +	/* safe to sync timeout after queues are quiesced */
> +	nvme_quiesce_timeout(&dev->ctrl);
> +	blk_quiesce_timeout(dev->ctrl.admin_q);
> +
>  	nvme_pci_disable(dev);
>  
> +	/*
> +	 * Both timeout and interrupt handler have been drained, and all
> +	 * in-flight requests will be canceled now.
> +	 */
>  	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
>  	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);


We need to return BLK_EH_RESET_TIMER from nvme_timeout then:
1. It defers the completion: we can't unmap the I/O request before the controller is shut down completely, so not BLK_EH_HANDLED.
2. nvme_cancel_request can still complete it: blk_mq_complete_request is invoked by nvme_cancel_request, so not BLK_EH_NOT_HANDLED.

Thanks
Jianchao
Ming Lei May 2, 2018, 4:54 a.m. UTC | #2
On Wed, May 02, 2018 at 10:23:20AM +0800, jianchao.wang wrote:
> Hi Ming
> 
> We need to return BLK_EH_RESET_TIMER in nvme_timeout then:
> 1. defer the completion. we can't unmap the io request before close the controller totally, so not BLK_EH_HANDLED.
> 2. nvme_cancel_request could complete it. blk_mq_complete_request is invoked by nvme_cancel_request, so not BLK_EH_NOT_HANDLED.

We don't need to change the return value of .timeout() any more after
calling nvme_quiesce_timeout():

Thanks to the single EH kthread, once nvme_quiesce_timeout() returns, all
timed-out requests have been handled already. Some of them may be completed,
and others may be handled as RESET_TIMER, but none of them are handled as
NOT_HANDLED because nvme_timeout() never returns that value.

So the following blk_mq_tagset_busy_iter(nvme_cancel_request) can handle
all in-flight requests because timeout handling is drained and quiesced.

Thanks,
Ming
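
The cancellation path that the explanation above relies on is the following
ordering inside the patched nvme_dev_disable() (condensed from the hunk
further below, with annotations added):

	nvme_suspend_queue(&dev->queues[i]);	/* stop completions from the queues */

	/* drain any nvme_timeout() still running, and block new timeouts */
	nvme_quiesce_timeout(&dev->ctrl);
	blk_quiesce_timeout(dev->ctrl.admin_q);

	nvme_pci_disable(dev);			/* controller and interrupts are down */

	/* every remaining in-flight request can now be canceled safely */
	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);

	/* re-enable timeout handling now that nothing is left in flight */
	nvme_unquiesce_timeout(&dev->ctrl);
	blk_unquiesce_timeout(dev->ctrl.admin_q);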
jianchao.wang May 2, 2018, 5:12 a.m. UTC | #3
Hi Ming

On 05/02/2018 12:54 PM, Ming Lei wrote:
>> We need to return BLK_EH_RESET_TIMER in nvme_timeout then:
>> 1. defer the completion. we can't unmap the io request before close the controller totally, so not BLK_EH_HANDLED.
>> 2. nvme_cancel_request could complete it. blk_mq_complete_request is invoked by nvme_cancel_request, so not BLK_EH_NOT_HANDLED.
> We don't need to change return value of .timeout() any more after
> calling nvme_quiesce_timeout():
> 
> Thanks the single EH kthread, once nvme_quiesce_timeout() returns, all
> timed-out requests have been handled already. Some of them may be completed,
> and others may be handled as RESET_TIMER, but none of them are handled as
> NOT_HANDLED because nvme_timeout() won't return that value.
> 
> So the following blk_mq_tagset_busy_iter(nvme_cancel_request) can handle
> all in-flight requests because timeout is drained and quiesced.

The key point here is that we cannot unmap the I/O requests before the controller has been shut down completely.
The NVMe controller may still hold the command after we complete and unmap the I/O request in nvme_timeout.
That will cause memory corruption.

So we cannot just schedule the EH recovery kthread and then return BLK_EH_HANDLED.
We need to defer the completion of timed-out requests until the controller has been shut down completely in nvme_dev_disable.

Thanks
Jianchao
Ming Lei May 2, 2018, 7:27 a.m. UTC | #4
On Wed, May 02, 2018 at 01:12:57PM +0800, jianchao.wang wrote:
> Hi Ming
> 
> On 05/02/2018 12:54 PM, Ming Lei wrote:
> >> We need to return BLK_EH_RESET_TIMER in nvme_timeout then:
> >> 1. defer the completion. we can't unmap the io request before close the controller totally, so not BLK_EH_HANDLED.
> >> 2. nvme_cancel_request could complete it. blk_mq_complete_request is invoked by nvme_cancel_request, so not BLK_EH_NOT_HANDLED.
> > We don't need to change return value of .timeout() any more after
> > calling nvme_quiesce_timeout():
> > 
> > Thanks the single EH kthread, once nvme_quiesce_timeout() returns, all
> > timed-out requests have been handled already. Some of them may be completed,
> > and others may be handled as RESET_TIMER, but none of them are handled as
> > NOT_HANDLED because nvme_timeout() won't return that value.
> > 
> > So the following blk_mq_tagset_busy_iter(nvme_cancel_request) can handle
> > all in-flight requests because timeout is drained and quiesced.
> 
> The key point here is we cannot unmap the io requests before we close the controller directly.
> The nvme controller may still hold the command after we complete and unmap the io request in nvme_timeout.
> This will cause memory corruption.
> 
> So we cannot just schedule the eh recovery kthread then return BLK_EH_HANDLED.
> We need to deffer the completion of timeout requests until the controller has been closed totally in nvme_dev_disable.
> 

Good catch!

I am going to document this point since it is easy to miss.

Yeah, it looks simpler to return BLK_EH_RESET_TIMER for NVMe to handle
this issue, but it isn't very readable; it amounts to: we have already
stopped the timeout, yet BLK_EH_RESET_TIMER still has to be returned to ask
the block layer not to complete this request.

So IMO it may be better to return BLK_EH_NOT_HANDLED, but then it becomes
a bit tricky to handle this request in the EH thread; maybe an NVMe request
flag is needed so the request can be completed in nvme_cancel_request().

Thanks
Ming
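
For illustration only, here is a rough sketch of the request-flag idea
floated in the reply above. The flag name NVME_REQ_TIMED_OUT and this exact
handling are hypothetical (they are not part of the posted series), and the
sketch glosses over how a request still owned by the block-layer timeout
code would actually be completed:

	/* nvme_timeout(): hand the request over to EH instead of completing it */
	nvme_req(req)->flags |= NVME_REQ_TIMED_OUT;	/* hypothetical flag */
	nvme_eh_schedule(dev);
	return BLK_EH_NOT_HANDLED;

	/* nvme_cancel_request(): runs after nvme_dev_disable() has shut the
	 * controller down, so unmapping/completing is safe; a request marked
	 * NVME_REQ_TIMED_OUT would be completed from this path as well */
	nvme_req(req)->status = NVME_SC_ABORT_REQ;
	blk_mq_complete_request(req);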

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f9028873298e..664a268accd1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3582,6 +3582,28 @@  void nvme_start_freeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_unquiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_quiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a19b7f04ac24..956ee19ff403 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -19,6 +19,7 @@ 
 #include <linux/pci.h>
 #include <linux/kref.h>
 #include <linux/blk-mq.h>
+#include <linux/blkdev.h>
 #include <linux/lightnvm.h>
 #include <linux/sed-opal.h>
 #include <linux/fault-inject.h>
@@ -405,6 +406,8 @@  int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8172ee584130..86c14d9051ff 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -29,6 +29,7 @@ 
 #include <linux/types.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/sed-opal.h>
+#include <linux/kthread.h>
 
 #include "nvme.h"
 
@@ -112,6 +113,11 @@  struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+
+	spinlock_t	  eh_lock;
+	bool		eh_in_recovery;
+	struct task_struct    *ehandler;
+	struct work_struct post_eh_work;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1176,6 +1182,100 @@  static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	if (!dev->eh_in_recovery) {
+		dev->eh_in_recovery = true;
+		wake_up_process(dev->ehandler);
+	}
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_done(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	if (dev->eh_in_recovery)
+		dev->eh_in_recovery = false;
+	spin_unlock(&dev->eh_lock);
+}
+
+static int nvme_error_handler(void *data)
+{
+	struct nvme_dev *dev = data;
+
+	while (true) {
+		/*
+		 * The sequence in kthread_stop() sets the stop flag first
+		 * then wakes the process.  To avoid missed wakeups, the task
+		 * should always be in a non running state before the stop
+		 * flag is checked
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+
+		spin_lock(&dev->eh_lock);
+		if (!dev->eh_in_recovery) {
+			spin_unlock(&dev->eh_lock);
+			schedule();
+			continue;
+		}
+		spin_unlock(&dev->eh_lock);
+
+		__set_current_state(TASK_RUNNING);
+
+		dev_info(dev->ctrl.device, "start eh recovery\n");
+
+		/* freeze queues before recovery */
+		nvme_start_freeze(&dev->ctrl);
+
+		nvme_dev_disable(dev, false);
+
+		/*
+		 * reset won't wait for IO completion any more, so it
+		 * is safe to reset controller in sync way
+		 */
+		nvme_reset_ctrl_sync(&dev->ctrl);
+
+		dev_info(dev->ctrl.device, "eh recovery done\n");
+
+		/*
+		 * drain IO & unfreeze queues in another context because
+		 * these IOs may trigger timeout again
+		 */
+		queue_work(nvme_reset_wq, &dev->post_eh_work);
+	}
+	__set_current_state(TASK_RUNNING);
+	dev->ehandler = NULL;
+
+	return 0;
+}
+
+static void nvme_post_eh_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, post_eh_work);
+
+	nvme_wait_freeze(&dev->ctrl);
+	nvme_unfreeze(&dev->ctrl);
+}
+
+static int nvme_eh_init(struct nvme_dev *dev)
+{
+	spin_lock_init(&dev->eh_lock);
+	INIT_WORK(&dev->post_eh_work, nvme_post_eh_work);
+
+	dev->ehandler = kthread_run(nvme_error_handler, dev,
+			"nvme_eh_%d", dev->ctrl.instance);
+	if (IS_ERR(dev->ehandler)) {
+		dev_err(dev->ctrl.device, "error handler thread failed to spawn, error = %ld\n",
+			PTR_ERR(dev->ehandler));
+		return PTR_ERR(dev->ehandler);
+	}
+	return 0;
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1197,8 +1297,7 @@  static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 */
 	if (nvme_should_reset(dev, csts)) {
 		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
+		nvme_eh_schedule(dev);
 		return BLK_EH_HANDLED;
 	}
 
@@ -1224,8 +1323,8 @@  static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_eh_schedule(dev);
 		return BLK_EH_HANDLED;
 	default:
 		break;
@@ -1240,14 +1339,12 @@  static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
-
 		/*
 		 * Mark the request as handled, since the inline shutdown
 		 * forces all outstanding requests to complete.
 		 */
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_eh_schedule(dev);
 		return BLK_EH_HANDLED;
 	}
 
@@ -2246,8 +2343,8 @@  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	if (pci_is_enabled(pdev)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING) {
+		if (shutdown && (dev->ctrl.state == NVME_CTRL_LIVE ||
+		    dev->ctrl.state == NVME_CTRL_RESETTING)) {
 			nvme_start_freeze(&dev->ctrl);
 			frozen = true;
 		}
@@ -2281,11 +2378,23 @@  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
 
+	/* safe to sync timeout after queues are quiesced */
+	nvme_quiesce_timeout(&dev->ctrl);
+	blk_quiesce_timeout(dev->ctrl.admin_q);
+
 	nvme_pci_disable(dev);
 
+	/*
+	 * Both timeout and interrupt handler have been drained, and all
+	 * in-flight requests will be canceled now.
+	 */
 	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
 	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
 
+	/* all requests are canceled now, so enable timeout now */
+	nvme_unquiesce_timeout(&dev->ctrl);
+	blk_unquiesce_timeout(dev->ctrl.admin_q);
+
 	/*
 	 * The driver will not be starting up queues again if shutting down so
 	 * must flush all entered requests to their failed completion to avoid
@@ -2416,9 +2525,14 @@  static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out;
 
+	nvme_eh_done(dev);
+
 	/*
 	 * Keep the controller around but remove all namespaces if we don't have
 	 * any working I/O queue.
+	 *
+	 * Now we won't wait for IO completion here any more, and sync reset
+	 * can be used safely anywhere.
 	 */
 	if (dev->online_queues < 2) {
 		dev_warn(dev->ctrl.device, "IO queues not created\n");
@@ -2427,11 +2541,9 @@  static void nvme_reset_work(struct work_struct *work)
 		new_state = NVME_CTRL_ADMIN_ONLY;
 	} else {
 		nvme_start_queues(&dev->ctrl);
-		nvme_wait_freeze(&dev->ctrl);
 		/* hit this only when allocate tagset fails */
 		if (nvme_dev_add(dev))
 			new_state = NVME_CTRL_ADMIN_ONLY;
-		nvme_unfreeze(&dev->ctrl);
 	}
 
 	/*
@@ -2449,6 +2561,7 @@  static void nvme_reset_work(struct work_struct *work)
 
  out:
 	nvme_remove_dead_ctrl(dev, result);
+	nvme_eh_done(dev);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -2590,10 +2703,15 @@  static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+	if (nvme_eh_init(dev))
+		goto uninit_ctrl;
+
 	nvme_reset_ctrl(&dev->ctrl);
 
 	return 0;
 
+ uninit_ctrl:
+	nvme_uninit_ctrl(&dev->ctrl);
  release_pools:
 	nvme_release_prp_pools(dev);
  unmap:
@@ -2647,6 +2765,8 @@  static void nvme_remove(struct pci_dev *pdev)
 	nvme_stop_ctrl(&dev->ctrl);
 	nvme_remove_namespaces(&dev->ctrl);
 	nvme_dev_disable(dev, true);
+	if (dev->ehandler)
+		kthread_stop(dev->ehandler);
 	nvme_free_host_mem(dev);
 	nvme_dev_remove_admin(dev);
 	nvme_free_queues(dev, 0);