[6/8] nvme-tcp-offload: Add controller level error recovery implementation

Message ID	20210602184246.14184-7-smalin@marvell.com (mailing list archive)
State	Accepted
Commit	5faf6d68554808ef1b9c8647a7e8fd8a4e8cb0a5
Delegated to:	Netdev Maintainers
Headers	show Return-Path: <netdev-owner@kernel.org> From: Shai Malin <smalin@marvell.com> To: <linux-nvme@lists.infradead.org>, <sagi@grimberg.me>, <hch@lst.de>, <axboe@fb.com>, <kbusch@kernel.org> CC: <netdev@vger.kernel.org>, <davem@davemloft.net>, <kuba@kernel.org>, <aelior@marvell.com>, <mkalderon@marvell.com>, <okulkarni@marvell.com>, <pkushwaha@marvell.com>, <prabhakar.pkin@gmail.com>, <malin1024@gmail.com>, <smalin@marvell.com>, Arie Gershberg <agershberg@marvell.com> Subject: [PATCH 6/8] nvme-tcp-offload: Add controller level error recovery implementation Date: Wed, 2 Jun 2021 21:42:44 +0300 Message-ID: <20210602184246.14184-7-smalin@marvell.com> In-Reply-To: <20210602184246.14184-1-smalin@marvell.com> References: <20210602184246.14184-1-smalin@marvell.com> MIME-Version: 1.0 Content-Type: text/plain Precedence: bulk
Series	NVMeTCP Offload ULP \| expand [0/8] NVMeTCP Offload ULP [1/8] nvme-tcp-offload: Add nvme-tcp-offload - NVMeTCP HW offload ULP [2/8] nvme-fabrics: Move NVMF_ALLOWED_OPTS and NVMF_REQUIRED_OPTS definitions [3/8] nvme-fabrics: Expose nvmf_check_required_opts() globally [4/8] nvme-tcp-offload: Add device scan implementation [5/8] nvme-tcp-offload: Add controller level implementation [6/8] nvme-tcp-offload: Add controller level error recovery implementation [7/8] nvme-tcp-offload: Add queue level implementation [8/8] nvme-tcp-offload: Add IO level implementation

Context	Check	Description
netdev/tree_selection	success	Not a local patch

diff --git a/drivers/nvme/host/tcp-offload.c b/drivers/nvme/host/tcp-offload.c index d05dec9c0add..97c1fd33adb9 100644 --- a/drivers/nvme/host/tcp-offload.c +++ b/drivers/nvme/host/tcp-offload.c @@ -72,6 +72,23 @@ void nvme_tcp_ofld_unregister_dev(struct nvme_tcp_ofld_dev *dev) } EXPORT_SYMBOL_GPL(nvme_tcp_ofld_unregister_dev); +/** + * nvme_tcp_ofld_error_recovery() - NVMeTCP Offload library error recovery. + * function. + * @nctrl: NVMe controller instance to change to resetting. + * + * API function that change the controller state to resseting. + * Part of the overall controller reset sequence. + */ +void nvme_tcp_ofld_error_recovery(struct nvme_ctrl *nctrl) +{ + if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_RESETTING)) + return; + + queue_work(nvme_reset_wq, &to_tcp_ofld_ctrl(nctrl)->err_work); +} +EXPORT_SYMBOL_GPL(nvme_tcp_ofld_error_recovery); + /** * nvme_tcp_ofld_report_queue_err() - NVMeTCP Offload report error event * callback function. Pointed to by nvme_tcp_ofld_queue->report_err. @@ -82,7 +99,8 @@ EXPORT_SYMBOL_GPL(nvme_tcp_ofld_unregister_dev); */ int nvme_tcp_ofld_report_queue_err(struct nvme_tcp_ofld_queue *queue) { - /* Placeholder - invoke error recovery flow */ + pr_err("nvme-tcp-offload queue error\n"); + nvme_tcp_ofld_error_recovery(&queue->ctrl->nctrl); return 0; } @@ -287,6 +305,28 @@ nvme_tcp_ofld_configure_io_queues(struct nvme_ctrl *nctrl, bool new) return rc; } +static void nvme_tcp_ofld_reconnect_or_remove(struct nvme_ctrl *nctrl) +{ + /* If we are resetting/deleting then do nothing */ + if (nctrl->state != NVME_CTRL_CONNECTING) { + WARN_ON_ONCE(nctrl->state == NVME_CTRL_NEW || + nctrl->state == NVME_CTRL_LIVE); + + return; + } + + if (nvmf_should_reconnect(nctrl)) { + dev_info(nctrl->device, "Reconnecting in %d seconds...\n", + nctrl->opts->reconnect_delay); + queue_delayed_work(nvme_wq, + &to_tcp_ofld_ctrl(nctrl)->connect_work, + nctrl->opts->reconnect_delay * HZ); + } else { + dev_info(nctrl->device, "Removing controller...\n"); + nvme_delete_ctrl(nctrl); + } +} + static int nvme_tcp_ofld_setup_ctrl(struct nvme_ctrl *nctrl, bool new) { struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); @@ -410,10 +450,63 @@ nvme_tcp_ofld_teardown_io_queues(struct nvme_ctrl *nctrl, bool remove) /* Placeholder - teardown_io_queues */ } +static void nvme_tcp_ofld_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_tcp_ofld_ctrl *ctrl = + container_of(to_delayed_work(work), + struct nvme_tcp_ofld_ctrl, + connect_work); + struct nvme_ctrl *nctrl = &ctrl->nctrl; + + ++nctrl->nr_reconnects; + + if (nvme_tcp_ofld_setup_ctrl(nctrl, false)) + goto requeue; + + dev_info(nctrl->device, "Successfully reconnected (%d attempt)\n", + nctrl->nr_reconnects); + + nctrl->nr_reconnects = 0; + + return; + +requeue: + dev_info(nctrl->device, "Failed reconnect attempt %d\n", + nctrl->nr_reconnects); + nvme_tcp_ofld_reconnect_or_remove(nctrl); +} + +static void nvme_tcp_ofld_error_recovery_work(struct work_struct *work) +{ + struct nvme_tcp_ofld_ctrl *ctrl = + container_of(work, struct nvme_tcp_ofld_ctrl, err_work); + struct nvme_ctrl *nctrl = &ctrl->nctrl; + + nvme_stop_keep_alive(nctrl); + nvme_tcp_ofld_teardown_io_queues(nctrl, false); + /* unquiesce to fail fast pending requests */ + nvme_start_queues(nctrl); + nvme_tcp_ofld_teardown_admin_queue(nctrl, false); + blk_mq_unquiesce_queue(nctrl->admin_q); + + if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we started nctrl delete */ + WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING && + nctrl->state != NVME_CTRL_DELETING_NOIO); + + return; + } + + nvme_tcp_ofld_reconnect_or_remove(nctrl); +} + static void nvme_tcp_ofld_teardown_ctrl(struct nvme_ctrl *nctrl, bool shutdown) { - /* Placeholder - err_work and connect_work */ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + + cancel_work_sync(&ctrl->err_work); + cancel_delayed_work_sync(&ctrl->connect_work); nvme_tcp_ofld_teardown_io_queues(nctrl, shutdown); blk_mq_quiesce_queue(nctrl->admin_q); if (shutdown) @@ -428,6 +521,32 @@ static void nvme_tcp_ofld_delete_ctrl(struct nvme_ctrl *nctrl) nvme_tcp_ofld_teardown_ctrl(nctrl, true); } +static void nvme_tcp_ofld_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *nctrl = + container_of(work, struct nvme_ctrl, reset_work); + + nvme_stop_ctrl(nctrl); + nvme_tcp_ofld_teardown_ctrl(nctrl, false); + + if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING && + nctrl->state != NVME_CTRL_DELETING_NOIO); + + return; + } + + if (nvme_tcp_ofld_setup_ctrl(nctrl, false)) + goto out_fail; + + return; + +out_fail: + ++nctrl->nr_reconnects; + nvme_tcp_ofld_reconnect_or_remove(nctrl); +} + static int nvme_tcp_ofld_init_request(struct blk_mq_tag_set *set, struct request *rq, @@ -521,6 +640,10 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts) opts->nr_poll_queues + 1; nctrl->sqsize = opts->queue_size - 1; nctrl->kato = opts->kato; + INIT_DELAYED_WORK(&ctrl->connect_work, + nvme_tcp_ofld_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_tcp_ofld_error_recovery_work); + INIT_WORK(&nctrl->reset_work, nvme_tcp_ofld_reset_ctrl_work); if (!(opts->mask & NVMF_OPT_TRSVCID)) { opts->trsvcid = kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL); diff --git a/drivers/nvme/host/tcp-offload.h b/drivers/nvme/host/tcp-offload.h index 520a0ea6f4b8..4281d1dacc94 100644 --- a/drivers/nvme/host/tcp-offload.h +++ b/drivers/nvme/host/tcp-offload.h @@ -197,3 +197,4 @@ struct nvme_tcp_ofld_ops { /* Exported functions for lower vendor specific offload drivers */ int nvme_tcp_ofld_register_dev(struct nvme_tcp_ofld_dev *dev); void nvme_tcp_ofld_unregister_dev(struct nvme_tcp_ofld_dev *dev); +void nvme_tcp_ofld_error_recovery(struct nvme_ctrl *nctrl);

[6/8] nvme-tcp-offload: Add controller level error recovery implementation

Checks

Commit Message

Patch