@@ -3587,6 +3587,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ down_read(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ blk_unquiesce_timeout(ns->queue);
+ up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ down_read(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ blk_quiesce_timeout(ns->queue);
+ up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
@@ -412,6 +412,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
union nvme_result *res);
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
@@ -71,6 +71,7 @@ struct nvme_queue;
static void nvme_process_cq(struct nvme_queue *nvmeq);
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
freeze_queue);
+static int nvme_reset_dev(struct nvme_dev *dev);
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
@@ -113,6 +114,23 @@ struct nvme_dev {
dma_addr_t host_mem_descs_dma;
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
+
+ /* EH handler */
+ spinlock_t eh_lock;
+ bool ctrl_shutdown_started;
+ bool ctrl_failed;
+ unsigned int nested_eh;
+ struct work_struct fail_ctrl_work;
+ wait_queue_head_t eh_wq;
+ struct list_head eh_head;
+};
+
+#define NVME_MAX_NESTED_EH 32
+struct nvme_eh_work {
+ struct work_struct work;
+ struct nvme_dev *dev;
+ int seq;
+ struct list_head list;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1186,6 +1204,183 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
csts, result);
}
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+ dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
+
+ nvme_get_ctrl(&dev->ctrl);
+ nvme_dev_disable(dev, false, false);
+ if (!queue_work(nvme_wq, &dev->remove_work))
+ nvme_put_ctrl(&dev->ctrl);
+}
+
+static void nvme_eh_fail_ctrl_work(struct work_struct *work)
+{
+ struct nvme_dev *dev =
+ container_of(work, struct nvme_dev, fail_ctrl_work);
+
+ dev_info(dev->ctrl.device, "EH: fail controller\n");
+ nvme_remove_dead_ctrl(dev, 0);
+}
+
+static void nvme_eh_mark_ctrl_shutdown(struct nvme_dev *dev)
+{
+ spin_lock(&dev->eh_lock);
+ dev->ctrl_shutdown_started = false;
+ spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_sched_fail_ctrl(struct nvme_dev *dev)
+{
+ INIT_WORK(&dev->fail_ctrl_work, nvme_eh_fail_ctrl_work);
+ queue_work(nvme_reset_wq, &dev->fail_ctrl_work);
+}
+
+/* either controller is updated to LIVE or will be removed */
+static bool nvme_eh_reset_done(struct nvme_dev *dev)
+{
+ return dev->ctrl.state == NVME_CTRL_LIVE ||
+ dev->ctrl.state == NVME_CTRL_ADMIN_ONLY ||
+ dev->ctrl_failed;
+}
+
+static void nvme_eh_done(struct nvme_eh_work *eh_work, int result)
+{
+ struct nvme_dev *dev = eh_work->dev;
+ bool top_eh;
+
+ spin_lock(&dev->eh_lock);
+ top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+
+ /* Fail controller if the top EH can't recover it */
+ if (!result)
+ wake_up_all(&dev->eh_wq);
+ else if (top_eh) {
+ dev->ctrl_failed = true;
+ nvme_eh_sched_fail_ctrl(dev);
+ wake_up_all(&dev->eh_wq);
+ }
+
+ list_del(&eh_work->list);
+ spin_unlock(&dev->eh_lock);
+
+ dev_info(dev->ctrl.device, "EH %d: state %d, eh_done %d, top eh %d\n",
+ eh_work->seq, dev->ctrl.state, result, top_eh);
+ wait_event(dev->eh_wq, nvme_eh_reset_done(dev));
+
+ /* release the EH seq, so outer EH can be allocated bigger seq No. */
+ spin_lock(&dev->eh_lock);
+ dev->nested_eh--;
+ spin_unlock(&dev->eh_lock);
+
+ /*
+ * After controller is recovered in upper EH finally, we have to
+ * unfreeze queues if reset failed in this EH, otherwise blk-mq
+ * queues' freeze counter may be leaked.
+ *
+ * nvme_unfreeze() can only be called after controller state is
+ * updated to LIVE.
+ */
+ if (result && (dev->ctrl.state == NVME_CTRL_LIVE ||
+ dev->ctrl.state == NVME_CTRL_ADMIN_ONLY))
+ nvme_unfreeze(&dev->ctrl);
+}
+
+static void nvme_eh_work(struct work_struct *work)
+{
+ struct nvme_eh_work *eh_work =
+ container_of(work, struct nvme_eh_work, work);
+ struct nvme_dev *dev = eh_work->dev;
+ int result = -ENODEV;
+ bool top_eh;
+
+ dev_info(dev->ctrl.device, "EH %d: before shutdown\n",
+ eh_work->seq);
+ nvme_dev_disable(dev, false, true);
+
+ /* allow new EH to be created */
+ nvme_eh_mark_ctrl_shutdown(dev);
+
+ /*
+ * nvme_dev_disable cancels all in-flight requests, and wont't
+ * cause timout at all, so I am always the top EH now, but it
+ * becomes not true after 'reset_lock' is held, so have to check
+ * if I am still the top EH, and force to update to NVME_CTRL_RESETTING
+ * if yes.
+ */
+ mutex_lock(&dev->ctrl.reset_lock);
+ spin_lock(&dev->eh_lock);
+
+ /* allow top EH to preempt other inner EH */
+ top_eh = list_is_last(&eh_work->list, &dev->eh_head);
+ dev_info(dev->ctrl.device, "EH %d: after shutdown, top eh: %d\n",
+ eh_work->seq, top_eh);
+ if (!top_eh) {
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+ spin_unlock(&dev->eh_lock);
+ goto done;
+ }
+ } else {
+ nvme_force_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
+ result = 0;
+ }
+
+ spin_unlock(&dev->eh_lock);
+ result = nvme_reset_dev(dev);
+done:
+ mutex_unlock(&dev->ctrl.reset_lock);
+ nvme_eh_done(eh_work, result);
+ dev_info(dev->ctrl.device, "EH %d: after recovery %d\n",
+ eh_work->seq, result);
+
+ kfree(eh_work);
+}
+
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+ bool need_sched = false;
+ bool fail_ctrl = false;
+ struct nvme_eh_work *eh_work;
+ int seq;
+
+ spin_lock(&dev->eh_lock);
+ if (!dev->ctrl_shutdown_started) {
+ need_sched = true;
+ seq = dev->nested_eh;
+ if (++dev->nested_eh >= NVME_MAX_NESTED_EH) {
+ if (!dev->ctrl_failed)
+ dev->ctrl_failed = fail_ctrl = true;
+ else
+ need_sched = false;
+ } else
+ dev->ctrl_shutdown_started = true;
+ }
+ spin_unlock(&dev->eh_lock);
+
+ if (!need_sched)
+ return;
+
+ if (fail_ctrl) {
+ fail_ctrl:
+ nvme_eh_sched_fail_ctrl(dev);
+ return;
+ }
+
+ eh_work = kzalloc(sizeof(*eh_work), GFP_NOIO);
+ if (!eh_work)
+ goto fail_ctrl;
+
+ eh_work->dev = dev;
+ eh_work->seq = seq;
+
+ spin_lock(&dev->eh_lock);
+ list_add_tail(&eh_work->list, &dev->eh_head);
+ spin_unlock(&dev->eh_lock);
+
+ INIT_WORK(&eh_work->work, nvme_eh_work);
+ queue_work(nvme_reset_wq, &eh_work->work);
+}
+
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1207,9 +1402,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
*/
if (nvme_should_reset(dev, csts)) {
nvme_warn_reset(dev, csts);
- nvme_dev_disable(dev, false, true);
- nvme_reset_ctrl(&dev->ctrl);
- return BLK_EH_HANDLED;
+ nvme_eh_schedule(dev);
+ return BLK_EH_RESET_TIMER;
}
/*
@@ -1234,9 +1428,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
- nvme_dev_disable(dev, false, false);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
- return BLK_EH_HANDLED;
+ nvme_eh_schedule(dev);
+ return BLK_EH_RESET_TIMER;
default:
break;
}
@@ -1250,15 +1444,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, reset controller\n",
req->tag, nvmeq->qid);
- nvme_dev_disable(dev, false, true);
- nvme_reset_ctrl(&dev->ctrl);
-
/*
* Mark the request as handled, since the inline shutdown
* forces all outstanding requests to complete.
*/
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
- return BLK_EH_HANDLED;
+ nvme_eh_schedule(dev);
+ return BLK_EH_RESET_TIMER;
}
if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -2316,12 +2508,28 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
}
for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
nvme_suspend_queue(&dev->queues[i]);
+ /*
+ * safe to sync timeout after queues are quiesced, then all
+ * requests(include the time-out ones) will be canceled.
+ */
+ nvme_quiesce_timeout(&dev->ctrl);
+ if (dev->ctrl.admin_q)
+ blk_quiesce_timeout(dev->ctrl.admin_q);
nvme_pci_disable(dev);
+ /*
+ * Both timeout and interrupt handler have been drained, and all
+ * in-flight requests will be canceled now.
+ */
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
+ /* all requests have been canceled now, so enable timeout now */
+ nvme_unquiesce_timeout(&dev->ctrl);
+ if (dev->ctrl.admin_q)
+ blk_unquiesce_timeout(dev->ctrl.admin_q);
+
/*
* The driver will not be starting up queues again if shutting down so
* must flush all entered requests to their failed completion to avoid
@@ -2381,16 +2589,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
kfree(dev);
}
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
-{
- dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
-
- nvme_get_ctrl(&dev->ctrl);
- nvme_dev_disable(dev, false, false);
- if (!queue_work(nvme_wq, &dev->remove_work))
- nvme_put_ctrl(&dev->ctrl);
-}
-
static int nvme_reset_dev(struct nvme_dev *dev)
{
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
@@ -2400,7 +2598,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
lockdep_assert_held(&dev->ctrl.reset_lock);
- if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
+ if (dev->ctrl.state != NVME_CTRL_RESETTING)
goto out;
/*
@@ -2486,6 +2684,10 @@ static int nvme_reset_dev(struct nvme_dev *dev)
unfreeze_queue = true;
}
+ /* controller state may have been updated already by inner EH */
+ if (dev->ctrl.state == new_state)
+ goto reset_done;
+
result = -ENODEV;
/*
* If only admin queue live, keep it to do further investigation or
@@ -2499,6 +2701,7 @@ static int nvme_reset_dev(struct nvme_dev *dev)
nvme_start_ctrl(&dev->ctrl);
+ reset_done:
if (unfreeze_queue)
nvme_unfreeze(&dev->ctrl);
return 0;
@@ -2615,6 +2818,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
return 0;
}
+static void nvme_eh_init(struct nvme_dev *dev)
+{
+ spin_lock_init(&dev->eh_lock);
+ init_waitqueue_head(&dev->eh_wq);
+ INIT_LIST_HEAD(&dev->eh_head);
+}
+
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
@@ -2659,6 +2869,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
+ nvme_eh_init(dev);
+
nvme_reset_ctrl(&dev->ctrl);
return 0;
When one req is timed out, now nvme_timeout() handles it by the following way: nvme_dev_disable(dev, false); nvme_reset_ctrl(&dev->ctrl); return BLK_EH_HANDLED. There are several issues about the above approach: 1) IO may fail during resetting Admin IO timeout may be triggered in nvme_reset_dev() when error happens. Normal IO timeout may be triggered too during nvme_wait_freeze() in reset path. When the two kinds of timeout happen, the current reset mechanism can't work any more. 2) race between nvme_start_freeze and nvme_wait_freeze() & nvme_unfreeze() - nvme_dev_disable() and resetting controller are required for recovering controller, but the two are run from different contexts. nvme_start_freeze() is call from nvme_dev_disable() which is run timeout work context, and nvme_unfreeze() is run from reset work context. Unfortunatley timeout may be triggered during resetting controller, so nvme_start_freeze() may be run several times. - Also two reset work may run one by one, this may cause hang in nvme_wait_freeze() forever too. 3) all namespace's EH require to shutdown & reset the controller block's timeout handler is per-request-queue, that means each namespace's error handling may shutdown & reset the whole controller, then the shutdown from one namespace may quiese queues when resetting from another namespace is in-progress. This patch fixes the above issues by using nested EH: 1) run controller shutdown(nvme_dev_disable()) and resetting(nvme_reset_dev) from one same EH context, so the above race 2) can be fixed easily. 2) always start a new context for handling EH, and cancel all in-flight requests(include the timed-out ones) in nvme_dev_disable() by quiescing timeout event before shutdown controller. 3) limit the max number of nested EH, when the limit is reached, removes the controller and fails all in-flight request. With this approach, blktest block/011 can be passed. Cc: James Smart <james.smart@broadcom.com> Cc: Jianchao Wang <jianchao.w.wang@oracle.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Sagi Grimberg <sagi@grimberg.me> Cc: linux-nvme@lists.infradead.org Cc: Laurence Oberman <loberman@redhat.com> Signed-off-by: Ming Lei <ming.lei@redhat.com> --- drivers/nvme/host/core.c | 22 +++++ drivers/nvme/host/nvme.h | 2 + drivers/nvme/host/pci.c | 252 +++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 256 insertions(+), 20 deletions(-)