Message ID | 20240718181319.145884-1-kheib@redhat.com (mailing list archive) |
---|---|
State | Awaiting Upstream |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | [iwl-next,v2] i40e: Add support for fw health report | expand |
Dear Kamal, Thank you for your patch. Am 18.07.24 um 20:13 schrieb Kamal Heib: > Add support for reporting fw status via the devlink health report. I do not know the interface. So, the driver already has some way to communicate with the firmware, and you only need to hook it up to devlink? > Example: > # devlink health show pci/0000:02:00.0 reporter fw > pci/0000:02:00.0: > reporter fw > state healthy error 0 recover 0 > # devlink health diagnose pci/0000:02:00.0 reporter fw > Mode: normal Any way to force it into recovery mode to also check it? > Signed-off-by: Kamal Heib <kheib@redhat.com> > --- > v2: > - Address comments from Jiri. > - Move the creation of the health report. > --- > drivers/net/ethernet/intel/i40e/i40e.h | 1 + > .../net/ethernet/intel/i40e/i40e_devlink.c | 57 +++++++++++++++++++ > .../net/ethernet/intel/i40e/i40e_devlink.h | 2 + > drivers/net/ethernet/intel/i40e/i40e_main.c | 14 +++++ > 4 files changed, 74 insertions(+) > > diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h > index d546567e0286..f94671b6e7c6 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e.h > +++ b/drivers/net/ethernet/intel/i40e/i40e.h > @@ -465,6 +465,7 @@ static inline const u8 *i40e_channel_mac(struct i40e_channel *ch) > struct i40e_pf { > struct pci_dev *pdev; > struct devlink_port devlink_port; > + struct devlink_health_reporter *fw_health_report; > struct i40e_hw hw; > DECLARE_BITMAP(state, __I40E_STATE_SIZE__); > struct msix_entry *msix_entries; > diff --git a/drivers/net/ethernet/intel/i40e/i40e_devlink.c b/drivers/net/ethernet/intel/i40e/i40e_devlink.c > index cc4e9e2addb7..8fe64284e8d3 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_devlink.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_devlink.c > @@ -122,6 +122,25 @@ static int i40e_devlink_info_get(struct devlink *dl, > return err; > } > > +static int i40e_fw_reporter_diagnose(struct devlink_health_reporter *reporter, > + struct devlink_fmsg *fmsg, > + struct netlink_ext_ack *extack) > +{ > + struct i40e_pf *pf = devlink_health_reporter_priv(reporter); > + > + if (test_bit(__I40E_RECOVERY_MODE, pf->state)) > + devlink_fmsg_string_pair_put(fmsg, "Mode", "recovery"); > + else > + devlink_fmsg_string_pair_put(fmsg, "Mode", "normal"); I’d use ternary operator. > + > + return 0; > +} > + > +static const struct devlink_health_reporter_ops i40e_fw_reporter_ops = { > + .name = "fw", > + .diagnose = i40e_fw_reporter_diagnose, > +}; > + > static const struct devlink_ops i40e_devlink_ops = { > .info_get = i40e_devlink_info_get, > }; > @@ -233,3 +252,41 @@ void i40e_devlink_destroy_port(struct i40e_pf *pf) > { > devlink_port_unregister(&pf->devlink_port); > } > + > +/** > + * i40e_devlink_create_health_reporter - Create the health reporter for this PF > + * @pf: the PF to create reporter for > + * > + * Create health reporter for this PF. > + * > + * Return: zero on success or an error code on failure. No dot/period at the end. > + **/ > +int i40e_devlink_create_health_reporter(struct i40e_pf *pf) > +{ > + struct devlink *devlink = priv_to_devlink(pf); > + struct device *dev = &pf->pdev->dev; > + int rc = 0; > + > + devl_lock(devlink); > + pf->fw_health_report = > + devl_health_reporter_create(devlink, &i40e_fw_reporter_ops, 0, pf); > + if (IS_ERR(pf->fw_health_report)) { > + rc = PTR_ERR(pf->fw_health_report); > + dev_err(dev, "Failed to create fw reporter, err = %d\n", rc); > + } > + devl_unlock(devlink); > + > + return rc; > +} > + > +/** > + * i40e_devlink_destroy_health_reporter - Destroy the health reporter > + * @pf: the PF to cleanup The verb is spelled with a space: clean up. > + * > + * Destroy the health reporter > + **/ > +void i40e_devlink_destroy_health_reporter(struct i40e_pf *pf) > +{ > + if (!IS_ERR_OR_NULL(pf->fw_health_report)) > + devlink_health_reporter_destroy(pf->fw_health_report); > +} > diff --git a/drivers/net/ethernet/intel/i40e/i40e_devlink.h b/drivers/net/ethernet/intel/i40e/i40e_devlink.h > index 469fb3d2ee25..018679094bb5 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_devlink.h > +++ b/drivers/net/ethernet/intel/i40e/i40e_devlink.h > @@ -14,5 +14,7 @@ void i40e_devlink_register(struct i40e_pf *pf); > void i40e_devlink_unregister(struct i40e_pf *pf); > int i40e_devlink_create_port(struct i40e_pf *pf); > void i40e_devlink_destroy_port(struct i40e_pf *pf); > +int i40e_devlink_create_health_reporter(struct i40e_pf *pf); > +void i40e_devlink_destroy_health_reporter(struct i40e_pf *pf); > > #endif /* _I40E_DEVLINK_H_ */ > diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c > index cbcfada7b357..b6b3ae299b28 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c > @@ -15370,6 +15370,9 @@ static bool i40e_check_recovery_mode(struct i40e_pf *pf) > dev_crit(&pf->pdev->dev, "Firmware recovery mode detected. Limiting functionality.\n"); > dev_crit(&pf->pdev->dev, "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode.\n"); > set_bit(__I40E_RECOVERY_MODE, pf->state); > + if (pf->fw_health_report) > + devlink_health_report(pf->fw_health_report, > + "recovery mode detected", pf); > > return true; > } > @@ -15810,6 +15813,13 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > if (test_bit(__I40E_RECOVERY_MODE, pf->state)) > return i40e_init_recovery_mode(pf, hw); > > + err = i40e_devlink_create_health_reporter(pf); > + if (err) { > + dev_err(&pdev->dev, > + "Failed to create health reporter %d\n", err); > + goto err_health_reporter; > + } > + > err = i40e_init_lan_hmc(hw, hw->func_caps.num_tx_qp, > hw->func_caps.num_rx_qp, 0, 0); > if (err) { > @@ -16175,6 +16185,8 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > (void)i40e_shutdown_lan_hmc(hw); > err_init_lan_hmc: > kfree(pf->qp_pile); > + i40e_devlink_destroy_health_reporter(pf); > +err_health_reporter: > err_sw_init: > err_adminq_setup: > err_pf_reset: > @@ -16209,6 +16221,8 @@ static void i40e_remove(struct pci_dev *pdev) > > i40e_devlink_unregister(pf); > > + i40e_devlink_destroy_health_reporter(pf); > + > i40e_dbg_pf_exit(pf); > > i40e_ptp_stop(pf); Kind regards, Paul
On Thu, 18 Jul 2024 14:13:19 -0400 Kamal Heib wrote: > Add support for reporting fw status via the devlink health report. > > Example: > # devlink health show pci/0000:02:00.0 reporter fw > pci/0000:02:00.0: > reporter fw > state healthy error 0 recover 0 > # devlink health diagnose pci/0000:02:00.0 reporter fw > Mode: normal I'm not sure if health reporter is the right fit, depends on what the recovery mode is / does. We need documentation.
On 7/18/2024 11:13 AM, Kamal Heib wrote: > Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. > > > Add support for reporting fw status via the devlink health report. > > Example: > # devlink health show pci/0000:02:00.0 reporter fw > pci/0000:02:00.0: > reporter fw > state healthy error 0 recover 0 > # devlink health diagnose pci/0000:02:00.0 reporter fw > Mode: normal > > Signed-off-by: Kamal Heib <kheib@redhat.com> > --- > v2: > - Address comments from Jiri. > - Move the creation of the health report. > --- > drivers/net/ethernet/intel/i40e/i40e.h | 1 + > .../net/ethernet/intel/i40e/i40e_devlink.c | 57 +++++++++++++++++++ > .../net/ethernet/intel/i40e/i40e_devlink.h | 2 + > drivers/net/ethernet/intel/i40e/i40e_main.c | 14 +++++ > 4 files changed, 74 insertions(+) > <snip> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c > index cbcfada7b357..b6b3ae299b28 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c > @@ -15370,6 +15370,9 @@ static bool i40e_check_recovery_mode(struct i40e_pf *pf) > dev_crit(&pf->pdev->dev, "Firmware recovery mode detected. Limiting functionality.\n"); > dev_crit(&pf->pdev->dev, "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode.\n"); > set_bit(__I40E_RECOVERY_MODE, pf->state); > + if (pf->fw_health_report) > + devlink_health_report(pf->fw_health_report, > + "recovery mode detected", pf); > > return true; > } > @@ -15810,6 +15813,13 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > if (test_bit(__I40E_RECOVERY_MODE, pf->state)) > return i40e_init_recovery_mode(pf, hw); > > + err = i40e_devlink_create_health_reporter(pf); > + if (err) { > + dev_err(&pdev->dev, > + "Failed to create health reporter %d\n", err); > + goto err_health_reporter; Do you really want to fail probe if creating this simple health reporter fails? Thanks, Brett > + } > + > err = i40e_init_lan_hmc(hw, hw->func_caps.num_tx_qp, > hw->func_caps.num_rx_qp, 0, 0); > if (err) { > @@ -16175,6 +16185,8 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) > (void)i40e_shutdown_lan_hmc(hw); > err_init_lan_hmc: > kfree(pf->qp_pile); > + i40e_devlink_destroy_health_reporter(pf); > +err_health_reporter: > err_sw_init: > err_adminq_setup: > err_pf_reset: > @@ -16209,6 +16221,8 @@ static void i40e_remove(struct pci_dev *pdev) > > i40e_devlink_unregister(pf); > > + i40e_devlink_destroy_health_reporter(pf); > + > i40e_dbg_pf_exit(pf); > > i40e_ptp_stop(pf); > -- > 2.45.2 > >
> -----Original Message----- > From: Brett Creeley <bcreeley@amd.com> > Sent: Tuesday, July 23, 2024 1:41 PM > To: Kamal Heib <kheib@redhat.com>; intel-wired-lan@lists.osuosl.org > Cc: netdev@vger.kernel.org; Nguyen, Anthony L <anthony.l.nguyen@intel.com>; > Kitszel, Przemyslaw <przemyslaw.kitszel@intel.com>; ivecera > <ivecera@redhat.com>; Jakub Kicinski <kuba@kernel.org>; David S . Miller > <davem@davemloft.net>; Paolo Abeni <pabeni@redhat.com> > Subject: Re: [PATCH iwl-next v2] i40e: Add support for fw health report > > > > On 7/18/2024 11:13 AM, Kamal Heib wrote: > > Caution: This message originated from an External Source. Use proper caution > when opening attachments, clicking links, or responding. > > > > > > Add support for reporting fw status via the devlink health report. > > > > Example: > > # devlink health show pci/0000:02:00.0 reporter fw > > pci/0000:02:00.0: > > reporter fw > > state healthy error 0 recover 0 > > # devlink health diagnose pci/0000:02:00.0 reporter fw > > Mode: normal > > > > Signed-off-by: Kamal Heib <kheib@redhat.com> > > --- > > v2: > > - Address comments from Jiri. > > - Move the creation of the health report. > > --- > > drivers/net/ethernet/intel/i40e/i40e.h | 1 + > > .../net/ethernet/intel/i40e/i40e_devlink.c | 57 +++++++++++++++++++ > > .../net/ethernet/intel/i40e/i40e_devlink.h | 2 + > > drivers/net/ethernet/intel/i40e/i40e_main.c | 14 +++++ > > 4 files changed, 74 insertions(+) > > > > <snip> > > > diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c > b/drivers/net/ethernet/intel/i40e/i40e_main.c > > index cbcfada7b357..b6b3ae299b28 100644 > > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c > > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c > > @@ -15370,6 +15370,9 @@ static bool i40e_check_recovery_mode(struct > i40e_pf *pf) > > dev_crit(&pf->pdev->dev, "Firmware recovery mode detected. Limiting > functionality.\n"); > > dev_crit(&pf->pdev->dev, "Refer to the Intel(R) Ethernet Adapters and > Devices User Guide for details on firmware recovery mode.\n"); > > set_bit(__I40E_RECOVERY_MODE, pf->state); > > + if (pf->fw_health_report) > > + devlink_health_report(pf->fw_health_report, > > + "recovery mode detected", pf); > > > > return true; > > } > > @@ -15810,6 +15813,13 @@ static int i40e_probe(struct pci_dev *pdev, const > struct pci_device_id *ent) > > if (test_bit(__I40E_RECOVERY_MODE, pf->state)) > > return i40e_init_recovery_mode(pf, hw); > > > > + err = i40e_devlink_create_health_reporter(pf); > > + if (err) { > > + dev_err(&pdev->dev, > > + "Failed to create health reporter %d\n", err); > > + goto err_health_reporter; > > Do you really want to fail probe if creating this simple health reporter > fails? > > Thanks, > > Brett I agree. I would make this a dev_warn and continue without failure. Thanks, Jake
On 7/18/24 20:13, Kamal Heib wrote: > Add support for reporting fw status via the devlink health report. > > Example: > # devlink health show pci/0000:02:00.0 reporter fw > pci/0000:02:00.0: > reporter fw > state healthy error 0 recover 0 > # devlink health diagnose pci/0000:02:00.0 reporter fw > Mode: normal > > Signed-off-by: Kamal Heib <kheib@redhat.com> > --- > v2: > - Address comments from Jiri. > - Move the creation of the health report. > --- > drivers/net/ethernet/intel/i40e/i40e.h | 1 + > .../net/ethernet/intel/i40e/i40e_devlink.c | 57 +++++++++++++++++++ > .../net/ethernet/intel/i40e/i40e_devlink.h | 2 + > drivers/net/ethernet/intel/i40e/i40e_main.c | 14 +++++ > 4 files changed, 74 insertions(+) > [...] > +int i40e_devlink_create_health_reporter(struct i40e_pf *pf) > +{ > + struct devlink *devlink = priv_to_devlink(pf); > + struct device *dev = &pf->pdev->dev; > + int rc = 0; > + > + devl_lock(devlink); if you are going to have just one health reporter, you could just use devlink_health_reporter_create() and avoid lock+unlock calls here > + pf->fw_health_report = > + devl_health_reporter_create(devlink, &i40e_fw_reporter_ops, 0, pf); > + if (IS_ERR(pf->fw_health_report)) { > + rc = PTR_ERR(pf->fw_health_report); > + dev_err(dev, "Failed to create fw reporter, err = %d\n", rc); you are not zeroing pf->fw_health_report here, so ... > + } > + devl_unlock(devlink); > + > + return rc; > +} [snip] > --- a/drivers/net/ethernet/intel/i40e/i40e_main.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c > @@ -15370,6 +15370,9 @@ static bool i40e_check_recovery_mode(struct i40e_pf *pf) > dev_crit(&pf->pdev->dev, "Firmware recovery mode detected. Limiting functionality.\n"); > dev_crit(&pf->pdev->dev, "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode.\n"); > set_bit(__I40E_RECOVERY_MODE, pf->state); > + if (pf->fw_health_report) > + devlink_health_report(pf->fw_health_report, ... you could possibly call devlink_health_report() with ERR_PTR as the first argument > + "recovery mode detected", pf); > > return true; > }
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index d546567e0286..f94671b6e7c6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -465,6 +465,7 @@ static inline const u8 *i40e_channel_mac(struct i40e_channel *ch) struct i40e_pf { struct pci_dev *pdev; struct devlink_port devlink_port; + struct devlink_health_reporter *fw_health_report; struct i40e_hw hw; DECLARE_BITMAP(state, __I40E_STATE_SIZE__); struct msix_entry *msix_entries; diff --git a/drivers/net/ethernet/intel/i40e/i40e_devlink.c b/drivers/net/ethernet/intel/i40e/i40e_devlink.c index cc4e9e2addb7..8fe64284e8d3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_devlink.c +++ b/drivers/net/ethernet/intel/i40e/i40e_devlink.c @@ -122,6 +122,25 @@ static int i40e_devlink_info_get(struct devlink *dl, return err; } +static int i40e_fw_reporter_diagnose(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack) +{ + struct i40e_pf *pf = devlink_health_reporter_priv(reporter); + + if (test_bit(__I40E_RECOVERY_MODE, pf->state)) + devlink_fmsg_string_pair_put(fmsg, "Mode", "recovery"); + else + devlink_fmsg_string_pair_put(fmsg, "Mode", "normal"); + + return 0; +} + +static const struct devlink_health_reporter_ops i40e_fw_reporter_ops = { + .name = "fw", + .diagnose = i40e_fw_reporter_diagnose, +}; + static const struct devlink_ops i40e_devlink_ops = { .info_get = i40e_devlink_info_get, }; @@ -233,3 +252,41 @@ void i40e_devlink_destroy_port(struct i40e_pf *pf) { devlink_port_unregister(&pf->devlink_port); } + +/** + * i40e_devlink_create_health_reporter - Create the health reporter for this PF + * @pf: the PF to create reporter for + * + * Create health reporter for this PF. + * + * Return: zero on success or an error code on failure. + **/ +int i40e_devlink_create_health_reporter(struct i40e_pf *pf) +{ + struct devlink *devlink = priv_to_devlink(pf); + struct device *dev = &pf->pdev->dev; + int rc = 0; + + devl_lock(devlink); + pf->fw_health_report = + devl_health_reporter_create(devlink, &i40e_fw_reporter_ops, 0, pf); + if (IS_ERR(pf->fw_health_report)) { + rc = PTR_ERR(pf->fw_health_report); + dev_err(dev, "Failed to create fw reporter, err = %d\n", rc); + } + devl_unlock(devlink); + + return rc; +} + +/** + * i40e_devlink_destroy_health_reporter - Destroy the health reporter + * @pf: the PF to cleanup + * + * Destroy the health reporter + **/ +void i40e_devlink_destroy_health_reporter(struct i40e_pf *pf) +{ + if (!IS_ERR_OR_NULL(pf->fw_health_report)) + devlink_health_reporter_destroy(pf->fw_health_report); +} diff --git a/drivers/net/ethernet/intel/i40e/i40e_devlink.h b/drivers/net/ethernet/intel/i40e/i40e_devlink.h index 469fb3d2ee25..018679094bb5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_devlink.h +++ b/drivers/net/ethernet/intel/i40e/i40e_devlink.h @@ -14,5 +14,7 @@ void i40e_devlink_register(struct i40e_pf *pf); void i40e_devlink_unregister(struct i40e_pf *pf); int i40e_devlink_create_port(struct i40e_pf *pf); void i40e_devlink_destroy_port(struct i40e_pf *pf); +int i40e_devlink_create_health_reporter(struct i40e_pf *pf); +void i40e_devlink_destroy_health_reporter(struct i40e_pf *pf); #endif /* _I40E_DEVLINK_H_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index cbcfada7b357..b6b3ae299b28 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -15370,6 +15370,9 @@ static bool i40e_check_recovery_mode(struct i40e_pf *pf) dev_crit(&pf->pdev->dev, "Firmware recovery mode detected. Limiting functionality.\n"); dev_crit(&pf->pdev->dev, "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode.\n"); set_bit(__I40E_RECOVERY_MODE, pf->state); + if (pf->fw_health_report) + devlink_health_report(pf->fw_health_report, + "recovery mode detected", pf); return true; } @@ -15810,6 +15813,13 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (test_bit(__I40E_RECOVERY_MODE, pf->state)) return i40e_init_recovery_mode(pf, hw); + err = i40e_devlink_create_health_reporter(pf); + if (err) { + dev_err(&pdev->dev, + "Failed to create health reporter %d\n", err); + goto err_health_reporter; + } + err = i40e_init_lan_hmc(hw, hw->func_caps.num_tx_qp, hw->func_caps.num_rx_qp, 0, 0); if (err) { @@ -16175,6 +16185,8 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) (void)i40e_shutdown_lan_hmc(hw); err_init_lan_hmc: kfree(pf->qp_pile); + i40e_devlink_destroy_health_reporter(pf); +err_health_reporter: err_sw_init: err_adminq_setup: err_pf_reset: @@ -16209,6 +16221,8 @@ static void i40e_remove(struct pci_dev *pdev) i40e_devlink_unregister(pf); + i40e_devlink_destroy_health_reporter(pf); + i40e_dbg_pf_exit(pf); i40e_ptp_stop(pf);
Add support for reporting fw status via the devlink health report. Example: # devlink health show pci/0000:02:00.0 reporter fw pci/0000:02:00.0: reporter fw state healthy error 0 recover 0 # devlink health diagnose pci/0000:02:00.0 reporter fw Mode: normal Signed-off-by: Kamal Heib <kheib@redhat.com> --- v2: - Address comments from Jiri. - Move the creation of the health report. --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + .../net/ethernet/intel/i40e/i40e_devlink.c | 57 +++++++++++++++++++ .../net/ethernet/intel/i40e/i40e_devlink.h | 2 + drivers/net/ethernet/intel/i40e/i40e_main.c | 14 +++++ 4 files changed, 74 insertions(+)