Message ID | 20230620133711.22840-5-dwagner@suse.de (mailing list archive) |
---|---|
State | New, archived |
Series | nvme-fc: Fix blktests hangers |
Hi Daniel,

kernel test robot noticed the following build warnings:

[... https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Daniel-Wagner/nvme-fc-Do-not-wait-in-vain-when-unloading-module/20230620-213849
base:   linus/master
patch link:    https://lore.kernel.org/r/20230620133711.22840-5-dwagner%40suse.de
patch subject: [PATCH v2 4/5] nvme-fc: Make initial connect attempt synchronous
config: openrisc-randconfig-m041-20230622 (https://download.01.org/0day-ci/archive/20230624/202306240125.U2jdrjAY-lkp@intel.com/config)
compiler: or1k-linux-gcc (GCC) 12.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230624/202306240125.U2jdrjAY-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202306240125.U2jdrjAY-lkp@intel.com/

smatch warnings:
drivers/nvme/host/fc.c:3590 nvme_fc_init_ctrl() warn: passing zero to 'ERR_PTR'

vim +/ERR_PTR +3590 drivers/nvme/host/fc.c

61bff8ef008845 James Smart     2017-04-23  3533  	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
61bff8ef008845 James Smart     2017-04-23  3534  	if (ret)
98e3528012cd57 Ross Lagerwall  2023-01-20  3535  		goto out_free_queues;
e399441de9115c James Smart     2016-12-02  3536  
61bff8ef008845 James Smart     2017-04-23  3537  	/* at this point, teardown path changes to ref counting on nvme ctrl */
e399441de9115c James Smart     2016-12-02  3538  
98e3528012cd57 Ross Lagerwall  2023-01-20  3539  	ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
98e3528012cd57 Ross Lagerwall  2023-01-20  3540  			&nvme_fc_admin_mq_ops,
98e3528012cd57 Ross Lagerwall  2023-01-20  3541  			struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
98e3528012cd57 Ross Lagerwall  2023-01-20  3542  				    ctrl->lport->ops->fcprqst_priv_sz));
98e3528012cd57 Ross Lagerwall  2023-01-20  3543  	if (ret)
98e3528012cd57 Ross Lagerwall  2023-01-20  3544  		goto fail_ctrl;
98e3528012cd57 Ross Lagerwall  2023-01-20  3545  
e399441de9115c James Smart     2016-12-02  3546  	spin_lock_irqsave(&rport->lock, flags);
e399441de9115c James Smart     2016-12-02  3547  	list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
e399441de9115c James Smart     2016-12-02  3548  	spin_unlock_irqrestore(&rport->lock, flags);
e399441de9115c James Smart     2016-12-02  3549  
ac881fd1288ca6 Daniel Wagner   2023-06-20  3550  	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
4c984154efa131 James Smart     2018-06-13  3551  		dev_err(ctrl->ctrl.device,
4c984154efa131 James Smart     2018-06-13  3552  			"NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum);
4c984154efa131 James Smart     2018-06-13  3553  		goto fail_ctrl;

No error code on this path.  Originally it didn't matter because it was
hardcoded to return ERR_PTR(-EIO);

17c4dc6eb7e1b2 James Smart     2017-10-09  3554  	}
17c4dc6eb7e1b2 James Smart     2017-10-09  3555  
ac881fd1288ca6 Daniel Wagner   2023-06-20  3556  	ret = nvme_fc_create_association(ctrl);
ac881fd1288ca6 Daniel Wagner   2023-06-20  3557  	if (ret)
4c984154efa131 James Smart     2018-06-13  3558  		goto fail_ctrl;
4c984154efa131 James Smart     2018-06-13  3559  
4c984154efa131 James Smart     2018-06-13  3560  	dev_info(ctrl->ctrl.device,
4c984154efa131 James Smart     2018-06-13  3561  		"NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
e5ea42faa773c6 Hannes Reinecke 2021-09-22  3562  		ctrl->cnum, nvmf_ctrl_subsysnqn(&ctrl->ctrl));
4c984154efa131 James Smart     2018-06-13  3563  
4c984154efa131 James Smart     2018-06-13  3564  	return &ctrl->ctrl;
4c984154efa131 James Smart     2018-06-13  3565  
4c984154efa131 James Smart     2018-06-13  3566  fail_ctrl:
19fce0470f0503 James Smart     2020-12-01  3567  	cancel_work_sync(&ctrl->ioerr_work);
cf25809bec2c7d James Smart     2018-03-13  3568  	cancel_work_sync(&ctrl->ctrl.reset_work);
cf25809bec2c7d James Smart     2018-03-13  3569  	cancel_delayed_work_sync(&ctrl->connect_work);
cf25809bec2c7d James Smart     2018-03-13  3570  
de41447aac034c Ewan D. Milne   2017-04-24  3571  	ctrl->ctrl.opts = NULL;
17c4dc6eb7e1b2 James Smart     2017-10-09  3572  
61bff8ef008845 James Smart     2017-04-23  3573  	/* initiate nvme ctrl ref counting teardown */
e399441de9115c James Smart     2016-12-02  3574  	nvme_uninit_ctrl(&ctrl->ctrl);
61bff8ef008845 James Smart     2017-04-23  3575  
0b5a7669a457dd James Smart     2017-06-15  3576  	/* Remove core ctrl ref. */
0b5a7669a457dd James Smart     2017-06-15  3577  	nvme_put_ctrl(&ctrl->ctrl);
0b5a7669a457dd James Smart     2017-06-15  3578  
61bff8ef008845 James Smart     2017-04-23  3579  	/* as we're past the point where we transition to the ref
61bff8ef008845 James Smart     2017-04-23  3580  	 * counting teardown path, if we return a bad pointer here,
61bff8ef008845 James Smart     2017-04-23  3581  	 * the calling routine, thinking it's prior to the
61bff8ef008845 James Smart     2017-04-23  3582  	 * transition, will do an rport put. Since the teardown
61bff8ef008845 James Smart     2017-04-23  3583  	 * path also does a rport put, we do an extra get here to
61bff8ef008845 James Smart     2017-04-23  3584  	 * so proper order/teardown happens.
61bff8ef008845 James Smart     2017-04-23  3585  	 */
61bff8ef008845 James Smart     2017-04-23  3586  	nvme_fc_rport_get(rport);
61bff8ef008845 James Smart     2017-04-23  3587  
ac881fd1288ca6 Daniel Wagner   2023-06-20  3588  	if (ret > 0)
ac881fd1288ca6 Daniel Wagner   2023-06-20  3589  		ret = -EIO;
ac881fd1288ca6 Daniel Wagner   2023-06-20 @3590  	return ERR_PTR(ret);
e399441de9115c James Smart     2016-12-02  3591  
61bff8ef008845 James Smart     2017-04-23  3592  out_free_queues:
61bff8ef008845 James Smart     2017-04-23  3593  	kfree(ctrl->queues);
e399441de9115c James Smart     2016-12-02  3594  out_free_ida:
61bff8ef008845 James Smart     2017-04-23  3595  	put_device(ctrl->dev);
3dd83f4013f0e8 Sagi Grimberg   2022-02-14  3596  	ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
e399441de9115c James Smart     2016-12-02  3597  out_free_ctrl:
e399441de9115c James Smart     2016-12-02  3598  	kfree(ctrl);
e399441de9115c James Smart     2016-12-02  3599  out_fail:
e399441de9115c James Smart     2016-12-02  3600  	/* exit via here doesn't follow ctlr ref points */
e399441de9115c James Smart     2016-12-02  3601  	return ERR_PTR(ret);
e399441de9115c James Smart     2016-12-02  3602  }
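To spell out what the smatch warning above is pointing at: ERR_PTR(0) is simply
NULL, and IS_ERR(NULL) is false, so if ret is still zero when the fail_ctrl
label is reached (as on the failed nvme_change_ctrl_state() branch), the caller
receives NULL and treats it as a valid controller pointer. A minimal,
hypothetical sketch of that pattern (the demo_* names are invented for
illustration, not taken from the driver):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_ctrl {
	int cnum;
};

static struct demo_ctrl *demo_create(bool state_change_ok)
{
	int ret = 0;			/* never assigned on the failure path */

	if (!state_change_ok)
		goto fail;		/* ret is still 0 here */

	return kzalloc(sizeof(struct demo_ctrl), GFP_KERNEL);

fail:
	return ERR_PTR(ret);		/* ERR_PTR(0) == NULL */
}

static int demo_caller(void)
{
	struct demo_ctrl *ctrl = demo_create(false);

	if (IS_ERR(ctrl))		/* false: NULL is not an error pointer */
		return PTR_ERR(ctrl);

	return ctrl->cnum;		/* NULL dereference */
}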
On Tue, Jun 20, 2023 at 03:37:10PM +0200, Daniel Wagner wrote:
> Commit 4c984154efa1 ("nvme-fc: change controllers first connect to use
> reconnect path") made the connection attempt asynchronous in order to
> make the connection attempt from autoconnect/boot via udev/systemd up
> case a bit more reliable.
>
> Unfortunately, one side effect of this is that any wrong parameters
> provided from userspace will not be directly reported as invalid, e.g.
> auth keys.
>
> So instead of having the policy code inside the kernel it's better to
> address this in userspace, for example in nvme-cli or nvme-stas.
>
> This aligns the fc transport with tcp and rdma.
>
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
>  drivers/nvme/host/fc.c | 19 ++++++++-----------
>  1 file changed, 8 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 472ed285fd45..aa2911f07c6c 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2943,6 +2943,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
>  	/* force put free routine to ignore io queues */
>  	ctrl->ctrl.tagset = NULL;
>
> +	if (ret > 0)
> +		ret = -EIO;

All these checks for ret > 0 make me unhappy. I don't understand how
they are a part of the commit.

I have tried to look at the context and I think maybe you are working
around the fact that qla_nvme_ls_req() returns QLA_FUNCTION_FAILED on
error.

Also the qla_nvme_ls_req() function returns EINVAL on error. I just wrote a
commit message saying that none of the callers cared but I missed that
apparently it gets returned to nvme_fc_init_ctrl(). :/
https://lore.kernel.org/all/49866d28-4cfe-47b0-842b-78f110e61aab@moroto.mountain/

Let's just fix qla_nvme_ls_req() instead of working around it here.

And let's add a WARN_ON_ONCE() somewhere to prevent future bugs.

regards,
dan carpenter
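One possible shape of the WARN_ON_ONCE() Dan suggests, sketched here only as an
illustration (demo_normalize_status() is an invented name, not a proposed
kernel interface): keep the -EIO normalization, but make an unexpected positive
return loud so the offending lower-level driver gets fixed rather than silently
papered over.

#include <linux/bug.h>
#include <linux/errno.h>

/* demo_normalize_status() is a hypothetical helper; sketch only. */
static inline int demo_normalize_status(int ret)
{
	/* A positive value means some lower layer leaked a non-errno code. */
	if (WARN_ON_ONCE(ret > 0))
		return -EIO;
	return ret;
}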
On Mon, Jun 26, 2023 at 02:33:18PM +0300, Dan Carpenter wrote:
> > @@ -2943,6 +2943,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
> >  	/* force put free routine to ignore io queues */
> >  	ctrl->ctrl.tagset = NULL;
> >
> > +	if (ret > 0)
> > +		ret = -EIO;
>
> All these checks for ret > 0 make me unhappy. I don't understand how
> they are a part of the commit.

We have two types of error codes in the nvme subsystem. The negative
values are the usual ones and the positive ones are nvme protocol errors.

For example, if the authentication fails because of invalid credentials,
nvmf_connect_admin_queue() will return NVME_SC_AUTH_REQUIRED. This is also
the value which gets propagated to this point here. The problem is that any
positive error code is interpreted as a valid pointer later in the code,
which results in a crash.

> I have tried to look at the context and I think maybe you are working
> around the fact that qla_nvme_ls_req() returns QLA_FUNCTION_FAILED on
> error.

The auth blktests are exercising the error path here and that's why I added
this check. BTW, we already use this in other places; it is not completely
new in this subsystem.

> Also the qla_nvme_ls_req() function returns EINVAL on error. I just wrote a
> commit message saying that none of the callers cared but I missed that
> apparently it gets returned to nvme_fc_init_ctrl(). :/
> https://lore.kernel.org/all/49866d28-4cfe-47b0-842b-78f110e61aab@moroto.mountain/

Thanks!

> Let's just fix qla_nvme_ls_req() instead of working around it here.
>
> And let's add a WARN_ON_ONCE() somewhere to prevent future bugs.

This makes sense for the driver APIs. Though for the core nvme subsystem we
first need to discuss/redesign how to handle the protocol errors.

Thanks,
Daniel
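To make the crash scenario Daniel describes concrete: IS_ERR() only recognizes
pointer values produced from negative errnos in the -1..-MAX_ERRNO range, so a
positive NVMe status code run through ERR_PTR() becomes a small bogus address
that sails past the caller's error check. A hedged illustration (the status
value 0x191 is just a stand-in and the demo_* name is invented):

#include <linux/err.h>
#include <linux/types.h>

static bool demo_positive_status_slips_through(void)
{
	int status = 0x191;		/* stand-in for a positive NVMe status code */
	void *p = ERR_PTR(status);	/* encodes to the bogus address 0x191 */

	/*
	 * IS_ERR() only flags addresses created from -1..-MAX_ERRNO.  A small
	 * positive value is outside that range, so a caller checking only
	 * IS_ERR() would dereference 0x191 and crash.
	 */
	return !IS_ERR(p);		/* true: the bogus pointer is not caught */
}

The "if (ret > 0) ret = -EIO;" hunks in the patch are the guard that keeps such
values from ever reaching ERR_PTR().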
On 6/27/23 08:18, Daniel Wagner wrote:
> On Mon, Jun 26, 2023 at 02:33:18PM +0300, Dan Carpenter wrote:
>>> @@ -2943,6 +2943,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
>>>  	/* force put free routine to ignore io queues */
>>>  	ctrl->ctrl.tagset = NULL;
>>>
>>> +	if (ret > 0)
>>> +		ret = -EIO;
>>
>> All these checks for ret > 0 make me unhappy. I don't understand how
>> they are a part of the commit.
>
> We have two types of error codes in the nvme subsystem. The negative
> values are the usual ones and the positive ones are nvme protocol errors.
>
> For example, if the authentication fails because of invalid credentials,
> nvmf_connect_admin_queue() will return NVME_SC_AUTH_REQUIRED. This is also
> the value which gets propagated to this point here. The problem is that any
> positive error code is interpreted as a valid pointer later in the code,
> which results in a crash.
>
>> I have tried to look at the context and I think maybe you are working
>> around the fact that qla_nvme_ls_req() returns QLA_FUNCTION_FAILED on
>> error.
>
> The auth blktests are exercising the error path here and that's why I added
> this check. BTW, we already use this in other places; it is not completely
> new in this subsystem.
>
>> Also the qla_nvme_ls_req() function returns EINVAL on error. I just wrote a
>> commit message saying that none of the callers cared but I missed that
>> apparently it gets returned to nvme_fc_init_ctrl(). :/
>> https://lore.kernel.org/all/49866d28-4cfe-47b0-842b-78f110e61aab@moroto.mountain/
>
> Thanks!
>
>> Let's just fix qla_nvme_ls_req() instead of working around it here.
>>
>> And let's add a WARN_ON_ONCE() somewhere to prevent future bugs.
>
> This makes sense for the driver APIs. Though for the core nvme subsystem we
> first need to discuss/redesign how to handle the protocol errors.

I would stick with the 'normal' nvme syntax of having negative errors as
internal errors (ie errnos), '0' for no error, and positive numbers as
NVMe protocol errors.
As such I would also advocate to not map NVMe protocol errors onto error
numbers but rather fix the callers to not do a pointer conversion.

Cheers,

Hannes
On 6/27/23 08:39, Hannes Reinecke wrote:
> On 6/27/23 08:18, Daniel Wagner wrote:
>> On Mon, Jun 26, 2023 at 02:33:18PM +0300, Dan Carpenter wrote:
>>>> @@ -2943,6 +2943,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
>>>>  	/* force put free routine to ignore io queues */
>>>>  	ctrl->ctrl.tagset = NULL;
>>>> +	if (ret > 0)
>>>> +		ret = -EIO;
>>>
>>> All these checks for ret > 0 make me unhappy. I don't understand how
>>> they are a part of the commit.
>>
>> We have two types of error codes in the nvme subsystem. The negative
>> values are the usual ones and the positive ones are nvme protocol errors.
>>
>> For example, if the authentication fails because of invalid credentials,
>> nvmf_connect_admin_queue() will return NVME_SC_AUTH_REQUIRED. This is also
>> the value which gets propagated to this point here. The problem is that any
>> positive error code is interpreted as a valid pointer later in the code,
>> which results in a crash.
>>
>>> I have tried to look at the context and I think maybe you are working
>>> around the fact that qla_nvme_ls_req() returns QLA_FUNCTION_FAILED on
>>> error.
>>
>> The auth blktests are exercising the error path here and that's why I added
>> this check. BTW, we already use this in other places; it is not completely
>> new in this subsystem.
>>
>>> Also the qla_nvme_ls_req() function returns EINVAL on error. I just wrote a
>>> commit message saying that none of the callers cared but I missed that
>>> apparently it gets returned to nvme_fc_init_ctrl(). :/
>>> https://lore.kernel.org/all/49866d28-4cfe-47b0-842b-78f110e61aab@moroto.mountain/
>>
>> Thanks!
>>
>>> Let's just fix qla_nvme_ls_req() instead of working around it here.
>>>
>>> And let's add a WARN_ON_ONCE() somewhere to prevent future bugs.
>>
>> This makes sense for the driver APIs. Though for the core nvme subsystem we
>> first need to discuss/redesign how to handle the protocol errors.
>
> I would stick with the 'normal' nvme syntax of having negative errors as
> internal errors (ie errnos), '0' for no error, and positive numbers as
> NVMe protocol errors.
> As such I would also advocate to not map NVMe protocol errors onto error
> numbers but rather fix the callers to not do a pointer conversion.

Aw. Now I see it. It's the ->create_ctrl() callback which will always
return a controller pointer or an error value. If we were to return a
protocol error we would need to stick it into the controller structure
itself. But if we do that then we'll end up with a non-existing
controller, ie we'd be returning a structure for a dead controller.
Not exactly pretty, but it would allow us to improve the userland API to
return the NVMe protocol error by reading from the fabrics device; the
controller structure itself would be cleaned up when closing that device.
Hmm.

Cheers,

Hannes
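A very rough sketch of the idea Hannes is toying with, hedged heavily: none of
the fields or helpers below exist today, they only illustrate returning a
"dead" controller object that carries the NVMe status for userspace to read via
the fabrics device before it is released.

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/types.h>

/* Everything below is hypothetical; these fields do not exist today. */
struct demo_ctrl {
	bool	dead;			/* association was never established */
	u16	connect_status;		/* NVMe status of the failed connect */
};

/* Stub standing in for the association setup: <0 errno, 0 ok, >0 NVMe status. */
static int demo_connect(struct demo_ctrl *ctrl)
{
	return 0;
}

static struct demo_ctrl *demo_create_ctrl(void)
{
	struct demo_ctrl *ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
	int status;

	if (!ctrl)
		return ERR_PTR(-ENOMEM);

	status = demo_connect(ctrl);
	if (status < 0) {
		kfree(ctrl);
		return ERR_PTR(status);		/* plain errno, as today */
	}
	if (status > 0) {
		/*
		 * Protocol failure: hand back a "dead" controller that only
		 * carries the NVMe status.  Userspace reads it from the
		 * fabrics device and the object is freed on close.
		 */
		ctrl->dead = true;
		ctrl->connect_status = status;
	}
	return ctrl;
}

Whether such a dead object is worth the complexity compared to simply reporting
an errno is exactly the open question in this subthread.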
On 6/20/2023 6:37 AM, Daniel Wagner wrote:
> Commit 4c984154efa1 ("nvme-fc: change controllers first connect to use
> reconnect path") made the connection attempt asynchronous in order to
> make the connection attempt from autoconnect/boot via udev/systemd up
> case a bit more reliable.
>
> Unfortunately, one side effect of this is that any wrong parameters
> provided from userspace will not be directly reported as invalid, e.g.
> auth keys.
>
> So instead of having the policy code inside the kernel it's better to
> address this in userspace, for example in nvme-cli or nvme-stas.
>
> This aligns the fc transport with tcp and rdma.

As much as you want to make this change to make transports "similar", I am
dead set against it unless you are completing a long qualification of the
change on real FC hardware and FC-NVME devices. There is probably 1.5 yrs of
testing of different race conditions that drove this change. You cannot
declare success from a simplistic toy tool such as fcloop for validation.

The original issues exist, probably have even morphed given the time from
the original change, and this will seriously disrupt the transport and any
downstream releases. So I have a very strong NACK on this change.

Yes - things such as the connect failure results are difficult to return
back to nvme-cli. I have had many gripes about the nvme-cli's behavior over
the years, especially on negative cases due to race conditions which
required retries. It still fails this miserably. The async reconnect path
solved many of these issues for fc.

For the auth failure, how do we deal with things if auth fails over time as
reconnects fail due to credential changes? I would think commonality of
this behavior drives part of the choice.

-- james
Hi James,

On Sat, Jul 01, 2023 at 05:11:11AM -0700, James Smart wrote:
> As much as you want to make this change to make transports "similar", I am
> dead set against it unless you are completing a long qualification of the
> change on real FC hardware and FC-NVME devices. There is probably 1.5 yrs of
> testing of different race conditions that drove this change. You cannot
> declare success from a simplistic toy tool such as fcloop for validation.
>
> The original issues exist, probably have even morphed given the time from
> the original change, and this will seriously disrupt the transport and any
> downstream releases. So I have a very strong NACK on this change.
>
> Yes - things such as the connect failure results are difficult to return
> back to nvme-cli. I have had many gripes about the nvme-cli's behavior over
> the years, especially on negative cases due to race conditions which
> required retries. It still fails this miserably. The async reconnect path
> solved many of these issues for fc.
>
> For the auth failure, how do we deal with things if auth fails over time as
> reconnects fail due to credential changes? I would think commonality of
> this behavior drives part of the choice.

Alright, what do you think about the idea to introduce a new '--sync' option
to nvme-cli which forwards this info to the kernel, telling it that we want
to wait for the initial connect to succeed or fail? Obviously, this needs to
handle signals too.

From what I understood this is also what Ewan would like to have.

Hannes thought it would make sense to use the same initial connect logic in
tcp/rdma, because there could also be transient errors (e.g. spanning tree
protocol). In short, making tcp/rdma do the same thing as fc.

So let's drop the final patch from this series for the time being. Could you
give some feedback on the rest of the patches?

Thanks,
Daniel
On 7/6/2023 5:07 AM, Daniel Wagner wrote:
> Hi James,
>
> On Sat, Jul 01, 2023 at 05:11:11AM -0700, James Smart wrote:
>> As much as you want to make this change to make transports "similar", I am
>> dead set against it unless you are completing a long qualification of the
>> change on real FC hardware and FC-NVME devices. There is probably 1.5 yrs of
>> testing of different race conditions that drove this change. You cannot
>> declare success from a simplistic toy tool such as fcloop for validation.
>>
>> The original issues exist, probably have even morphed given the time from
>> the original change, and this will seriously disrupt the transport and any
>> downstream releases. So I have a very strong NACK on this change.
>>
>> Yes - things such as the connect failure results are difficult to return
>> back to nvme-cli. I have had many gripes about the nvme-cli's behavior over
>> the years, especially on negative cases due to race conditions which
>> required retries. It still fails this miserably. The async reconnect path
>> solved many of these issues for fc.
>>
>> For the auth failure, how do we deal with things if auth fails over time as
>> reconnects fail due to credential changes? I would think commonality of
>> this behavior drives part of the choice.
>
> Alright, what do you think about the idea to introduce a new '--sync' option
> to nvme-cli which forwards this info to the kernel, telling it that we want
> to wait for the initial connect to succeed or fail? Obviously, this needs to
> handle signals too.
>
> From what I understood this is also what Ewan would like to have.

To me this is not a sync vs non-sync option, it's a max_reconnects value
tested for in nvmf_should_reconnect(). Which, if set to 0 (or 1), should
fail if the initial connect fails.

Right now max_reconnects is calculated from ctrl_loss_tmo and
reconnect_delay. So there's already a way via the cli to make sure there's
only 1 connect attempt. I wouldn't mind seeing an explicit cli option that
sets it to 1 connection attempt w/o the user having to calculate and
specify the two values.

I also assume that this is not something that would be set by default in
the auto-connect scripts or automated cli startup scripts.

> Hannes thought it would make sense to use the same initial connect logic in
> tcp/rdma, because there could also be transient errors (e.g. spanning tree
> protocol). In short, making tcp/rdma do the same thing as fc.

I agree that the same connect logic makes sense for tcp/rdma. Certainly one
connect/teardown path vs one at create and one at reconnect makes sense. The
transient errors during 1st connect were why FC added it, and I would assume
tcp/rdma has its own transient errors or timing relationships at initial
connection setup, etc.

For FC, we're trying to work around errors to transport commands (FC NVME
ELS's) that fail (dropped or timeout) or commands used to initialize the
controller which may be dropped/timeout and thus fail controller init.
Although NVMe-FC does have a retransmission option, it generally doesn't
apply to the FC NVME LS's, and few of the FC devices have yet to turn on the
retransmission option to deal with the errors. So the general behavior is
connection termination and/or association termination, which then depends on
the reconnect path to retry. It's also critical as connection requests are
automated on FC based on connectivity events.
If we fail out to the cli due to the fabric dropping some up-front command,
there's no guarantee there will be another connectivity event to restart the
controller create, and we end up without device connectivity.

The other issue we had to deal with was how long sysadm hung out waiting for
the auto-connect script to complete. We couldn't wait for the entire
multiple-retry case, and returning before the 1st attempt was complete was
against the spirit of the cli - so we waited for the 1st attempt to try,
released sysadm, and let the reconnect go on in the background.

> So let's drop the final patch from this series for the time being. Could you
> give some feedback on the rest of the patches?
>
> Thanks,
> Daniel

I'll look at them.

-- james
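For readers following along, roughly how the knobs James refers to fit
together, as a simplified sketch rather than a verbatim copy of
drivers/nvme/host/fabrics.c: max_reconnects is derived from ctrl_loss_tmo and
reconnect_delay, and nvmf_should_reconnect() compares it with the number of
attempts made so far.

#include <linux/kernel.h>
#include <linux/types.h>

/* Simplified sketch (hedged); demo_* names are invented for illustration. */
static int demo_max_reconnects(int ctrl_loss_tmo, int reconnect_delay)
{
	if (ctrl_loss_tmo < 0)
		return -1;			/* retry forever */
	/* e.g. ctrl_loss_tmo=600s, reconnect_delay=10s -> budget of 60 */
	return DIV_ROUND_UP(ctrl_loss_tmo, reconnect_delay);
}

static bool demo_should_reconnect(int nr_reconnects, int max_reconnects)
{
	/* -1 means "no limit"; otherwise stop once the budget is used up */
	return max_reconnects == -1 || nr_reconnects < max_reconnects;
}

Under that reading, a very small ctrl_loss_tmo (0, or no larger than
reconnect_delay) leaves essentially no retry budget, which is the
single-attempt behaviour James mentions; the exact off-by-one semantics should
be checked against the real fabrics code rather than this sketch.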
On 7/12/23 00:47, James Smart wrote:
> On 7/6/2023 5:07 AM, Daniel Wagner wrote:
>> Hi James,
>>
>> On Sat, Jul 01, 2023 at 05:11:11AM -0700, James Smart wrote:
>>> As much as you want to make this change to make transports "similar",
>>> I am dead set against it unless you are completing a long qualification
>>> of the change on real FC hardware and FC-NVME devices. There is probably
>>> 1.5 yrs of testing of different race conditions that drove this change.
>>> You cannot declare success from a simplistic toy tool such as fcloop for
>>> validation.
>>>
>>> The original issues exist, probably have even morphed given the time
>>> from the original change, and this will seriously disrupt the transport
>>> and any downstream releases. So I have a very strong NACK on this change.
>>>
>>> Yes - things such as the connect failure results are difficult to return
>>> back to nvme-cli. I have had many gripes about the nvme-cli's behavior
>>> over the years, especially on negative cases due to race conditions which
>>> required retries. It still fails this miserably. The async reconnect
>>> path solved many of these issues for fc.
>>>
>>> For the auth failure, how do we deal with things if auth fails over
>>> time as reconnects fail due to credential changes? I would think
>>> commonality of this behavior drives part of the choice.
>>
>> Alright, what do you think about the idea to introduce a new '--sync'
>> option to nvme-cli which forwards this info to the kernel, telling it
>> that we want to wait for the initial connect to succeed or fail?
>> Obviously, this needs to handle signals too.
>>
>> From what I understood this is also what Ewan would like to have.
>
> To me this is not a sync vs non-sync option, it's a max_reconnects value
> tested for in nvmf_should_reconnect(). Which, if set to 0 (or 1), should
> fail if the initial connect fails.

Well, this is more a technical detail while we continue to harp on about
'sync' vs 'non-sync'.

Currently all instances of ->create_ctrl() are running asynchronously, ie
->create_ctrl() returns a 'ctrl' object which is still in the process of
establishing the connection. (And there it doesn't really matter whether
it's FC or TCP/RDMA; FC is kicking off a workqueue for the 'reconnect'
call, whereas TCP/RDMA creates the association and issues the actual
'connect' NVMe SQE via an I/O workqueue; the net result is identical.)

And when we talk about a 'sync' connect we are planning to _wait_ until
this asynchronous operation reaches a steady state, ie either after the
connect attempts succeeded or after the connect retries are exhausted.
And yes, we _are_ aware that this might be quite a long time.

> Right now max_reconnects is calculated from ctrl_loss_tmo and
> reconnect_delay. So there's already a way via the cli to make sure
> there's only 1 connect attempt. I wouldn't mind seeing an explicit cli
> option that sets it to 1 connection attempt w/o the user having to
> calculate and specify the two values.

Again, we do _not_ propose to change any of the default settings. The
'sync' option will not modify the reconnect settings, it will just wait
until a steady state is reached.

> I also assume that this is not something that would be set by default in
> the auto-connect scripts or automated cli startup scripts.

You assume correctly. That's why it'll be an additional option.

Cheers,

Hannes
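To illustrate what "wait until a steady state" could look like, a purely
hypothetical sketch (every name below is invented; this is not a proposal for
the actual patch): the connect machinery keeps running asynchronously, and a
'sync' caller merely blocks until the controller is either live or being torn
down.

#include <linux/wait.h>

enum demo_ctrl_state { DEMO_CONNECTING, DEMO_LIVE, DEMO_DELETING };

struct demo_ctrl {
	enum demo_ctrl_state	state;		/* updated by the connect machinery */
	wait_queue_head_t	state_wq;	/* init_waitqueue_head() at create time,
						 * woken on every state change */
};

/* Called only when the (hypothetical) 'sync' flag was requested. */
static int demo_wait_for_steady_state(struct demo_ctrl *ctrl)
{
	/*
	 * The async connect/reconnect work keeps running as before; we just
	 * block until it reaches a terminal state.  Using the interruptible
	 * variant lets nvme-cli be interrupted while waiting, which addresses
	 * the signal-handling concern raised earlier in the thread.
	 */
	return wait_event_interruptible(ctrl->state_wq,
					ctrl->state == DEMO_LIVE ||
					ctrl->state == DEMO_DELETING);
}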
The basic issue I am trying to solve with NVMe/FC is that we cannot get
systems to boot reliably from NVMe/FC (boot from SAN) because we don't know
how long the connect is going to take, and we have seen spurious failures in
our testing.

In general, I think the whole business of a userspace syscall() ->
asynchronous work in the kernel is problematic because the syscall can
return a good status even if the connect is never going to work due to a
condition that is not going to clear. It is difficult and cumbersome to
script around this reliably. So what I am saying is that the current
mechanism doesn't work completely right anyway.

We also encounter the problem Daniel is addressing when trying to get
blktests and other internal tests to work, for similar reasons.

-Ewan

On Thu, Jul 6, 2023 at 8:07 AM Daniel Wagner <dwagner@suse.de> wrote:
>
> Hi James,
>
> On Sat, Jul 01, 2023 at 05:11:11AM -0700, James Smart wrote:
> > As much as you want to make this change to make transports "similar", I am
> > dead set against it unless you are completing a long qualification of the
> > change on real FC hardware and FC-NVME devices. There is probably 1.5 yrs of
> > testing of different race conditions that drove this change. You cannot
> > declare success from a simplistic toy tool such as fcloop for validation.
> >
> > The original issues exist, probably have even morphed given the time from
> > the original change, and this will seriously disrupt the transport and any
> > downstream releases. So I have a very strong NACK on this change.
> >
> > Yes - things such as the connect failure results are difficult to return
> > back to nvme-cli. I have had many gripes about the nvme-cli's behavior over
> > the years, especially on negative cases due to race conditions which
> > required retries. It still fails this miserably. The async reconnect path
> > solved many of these issues for fc.
> >
> > For the auth failure, how do we deal with things if auth fails over time as
> > reconnects fail due to credential changes? I would think commonality of
> > this behavior drives part of the choice.
>
> Alright, what do you think about the idea to introduce a new '--sync' option
> to nvme-cli which forwards this info to the kernel, telling it that we want
> to wait for the initial connect to succeed or fail? Obviously, this needs to
> handle signals too.
>
> From what I understood this is also what Ewan would like to have.
>
> Hannes thought it would make sense to use the same initial connect logic in
> tcp/rdma, because there could also be transient errors (e.g. spanning tree
> protocol). In short, making tcp/rdma do the same thing as fc.
>
> So let's drop the final patch from this series for the time being. Could you
> give some feedback on the rest of the patches?
>
> Thanks,
> Daniel
>
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 472ed285fd45..aa2911f07c6c 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2943,6 +2943,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
 	/* force put free routine to ignore io queues */
 	ctrl->ctrl.tagset = NULL;
 
+	if (ret > 0)
+		ret = -EIO;
 	return ret;
 }
 
@@ -3545,21 +3547,15 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
 	spin_unlock_irqrestore(&rport->lock, flags);
 
-	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) ||
-	    !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
 		dev_err(ctrl->ctrl.device,
 			"NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum);
 		goto fail_ctrl;
 	}
 
-	if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) {
-		dev_err(ctrl->ctrl.device,
-			"NVME-FC{%d}: failed to schedule initial connect\n",
-			ctrl->cnum);
+	ret = nvme_fc_create_association(ctrl);
+	if (ret)
 		goto fail_ctrl;
-	}
-
-	flush_delayed_work(&ctrl->connect_work);
 
 	dev_info(ctrl->ctrl.device,
 		"NVME-FC{%d}: new ctrl: NQN \"%s\"\n",
@@ -3568,7 +3564,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	return &ctrl->ctrl;
 
 fail_ctrl:
-	nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
 	cancel_work_sync(&ctrl->ioerr_work);
 	cancel_work_sync(&ctrl->ctrl.reset_work);
 	cancel_delayed_work_sync(&ctrl->connect_work);
@@ -3590,7 +3585,9 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	 */
 	nvme_fc_rport_get(rport);
 
-	return ERR_PTR(-EIO);
+	if (ret > 0)
+		ret = -EIO;
+	return ERR_PTR(ret);
 
 out_free_queues:
 	kfree(ctrl->queues);
Commit 4c984154efa1 ("nvme-fc: change controllers first connect to use
reconnect path") made the connection attempt asynchronous in order to
make the connection attempt from autoconnect/boot via udev/systemd up
case a bit more reliable.

Unfortunately, one side effect of this is that any wrong parameters
provided from userspace will not be directly reported as invalid, e.g.
auth keys.

So instead of having the policy code inside the kernel it's better to
address this in userspace, for example in nvme-cli or nvme-stas.

This aligns the fc transport with tcp and rdma.

Signed-off-by: Daniel Wagner <dwagner@suse.de>
---
 drivers/nvme/host/fc.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)