@@ -45,9 +45,10 @@ static unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
-static u8 nvme_max_retries = 5;
+u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
+EXPORT_SYMBOL_GPL(nvme_max_retries);
static unsigned long default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644);
@@ -25,6 +25,8 @@ extern unsigned int nvme_io_timeout;
extern unsigned int admin_timeout;
#define ADMIN_TIMEOUT (admin_timeout * HZ)
+extern u8 nvme_max_retries;
+
#define NVME_DEFAULT_KATO 5
#define NVME_KATO_GRACE 10
@@ -24,6 +24,7 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>
+#include <linux/delay.h>
#include "trace.h"
#include "nvme.h"
@@ -106,6 +107,7 @@ struct nvme_dev {
unsigned long bar_mapped_size;
struct work_struct remove_work;
struct mutex shutdown_lock;
+ int successive_shutdown;
bool subsystem;
u64 cmb_size;
bool cmb_use_sqes;
@@ -1243,9 +1245,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
* shutdown, so we return BLK_EH_DONE.
*/
switch (dev->ctrl.state) {
- case NVME_CTRL_CONNECTING:
- nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
- /* fall through */
case NVME_CTRL_DELETING:
dev_warn_ratelimited(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
@@ -2383,11 +2382,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
struct pci_dev *pdev = to_pci_dev(dev->dev);
mutex_lock(&dev->shutdown_lock);
+ dev->successive_shutdown++;
if (pci_is_enabled(pdev)) {
u32 csts = readl(dev->bar + NVME_REG_CSTS);
if (dev->ctrl.state == NVME_CTRL_LIVE ||
- dev->ctrl.state == NVME_CTRL_RESETTING) {
+ dev->ctrl.state == NVME_CTRL_RESETTING ||
+ dev->ctrl.state == NVME_CTRL_CONNECTING) {
freeze = true;
nvme_start_freeze(&dev->ctrl);
}
@@ -2498,12 +2499,34 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
nvme_put_ctrl(&dev->ctrl);
}
+static bool nvme_wait_freeze_and_check(struct nvme_dev *dev)
+{
+ bool frozen;
+
+ while (true) {
+ frozen = nvme_frozen(&dev->ctrl);
+ if (frozen)
+ break;
+
+ /*
+	 * The controller may have been shut down due to a new timeout,
+	 * so break out of the loop and report it to the reset handler.
+ */
+ if (!dev->online_queues)
+ break;
+ msleep(3);
+ }
+
+ return frozen;
+}
+
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev =
container_of(work, struct nvme_dev, ctrl.reset_work);
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result;
+ bool reset_done = true;
if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
result = -ENODEV;
@@ -2600,23 +2623,34 @@ static void nvme_reset_work(struct work_struct *work)
nvme_free_tagset(dev);
} else {
nvme_start_queues(&dev->ctrl);
- nvme_wait_freeze(&dev->ctrl);
- nvme_dev_add(dev);
+ reset_done = nvme_wait_freeze_and_check(dev);
+ if (reset_done)
+ nvme_dev_add(dev);
nvme_unfreeze(&dev->ctrl);
}
/*
* If only admin queue live, keep it to do further investigation or
* recovery.
+ *
+	 * Also give up if we can't make progress after retrying enough times.
*/
- if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE) ||
+ (!reset_done && dev->successive_shutdown >= nvme_max_retries)) {
dev_warn(dev->ctrl.device,
"failed to mark controller live state\n");
result = -ENODEV;
goto out;
}
- nvme_start_ctrl(&dev->ctrl);
+	/* A new error occurred during reset, so schedule a fresh reset */
+ if (!reset_done) {
+ dev_warn(dev->ctrl.device, "new error during reset\n");
+ nvme_reset_ctrl(&dev->ctrl);
+ } else {
+ dev->successive_shutdown = 0;
+ nvme_start_ctrl(&dev->ctrl);
+ }
return;
out_unlock:
While the reset handler waits for in-flight I/O to complete, a timeout or controller failure may still occur, in which case the controller is deleted and all in-flight I/Os are failed. This is too heavy-handed:

1) the timeout may fire exactly during nvme_wait_freeze(), after the controller has already recovered from a transient event;

2) the configured retry limit isn't respected.

Improve the reset handling by replacing nvme_wait_freeze() with a query-and-check loop. If all namespace queues are frozen, the controller has been reset successfully; otherwise check whether the controller has been disabled in the meantime. If so, break out of the current recovery and schedule a fresh reset. This avoids failing I/O and removing the controller unnecessarily.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/nvme/host/core.c |  3 ++-
 drivers/nvme/host/nvme.h |  2 ++
 drivers/nvme/host/pci.c  | 50 +++++++++++++++++++++++++++++++++-------
 3 files changed, 46 insertions(+), 9 deletions(-)
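For reviewers, here is a minimal userspace sketch of the poll-and-bail pattern the patch introduces (instead of blocking indefinitely in nvme_wait_freeze(), poll the freeze state, escape early if the controller was disabled again, then either reschedule the reset or give up once the retry budget is spent). All names here (struct ctrl, wait_freeze_and_check, MAX_RETRIES, etc.) are illustrative stand-ins, not the real driver API:

```c
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define MAX_RETRIES 5	/* stand-in for the nvme_max_retries module parameter */

struct ctrl {
	bool frozen;		/* all namespace queues quiesced */
	int online_queues;	/* drops to 0 when the controller is disabled */
	int successive_shutdown;
};

/* Poll until frozen, but bail out if the controller was disabled again. */
static bool wait_freeze_and_check(struct ctrl *c)
{
	while (!c->frozen) {
		if (!c->online_queues)
			return false;	/* new failure: let the caller decide */
		usleep(3000);		/* mirrors the msleep(3) in the patch */
	}
	return true;
}

static void reset_work(struct ctrl *c)
{
	if (wait_freeze_and_check(c)) {
		c->successive_shutdown = 0;	/* clean reset: clear the budget */
		printf("reset complete\n");
	} else if (c->successive_shutdown >= MAX_RETRIES) {
		printf("giving up, removing controller\n");
	} else {
		printf("new error during reset, scheduling a fresh reset\n");
	}
}

int main(void)
{
	struct ctrl c = { .frozen = true, .online_queues = 4 };

	reset_work(&c);	/* prints "reset complete" */
	return 0;
}
```

The key design point is that the wait loop now has two exits: success (frozen) and a detected re-failure (no online queues), so the reset handler can distinguish "recovery finished" from "controller died again" and apply the retry budget instead of tearing the controller down on the first timeout.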