Message ID | 20230308051310.12544-4-shannon.nelson@amd.com (mailing list archive) |
---|---|
State | RFC |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | pds_core driver | expand |
Wed, Mar 08, 2023 at 06:13:00AM CET, shannon.nelson@amd.com wrote: >Add in the periodic health check and the related workqueue, >as well as the handlers for when a FW reset is seen. Why don't you use devlink health to let the user know that something odd happened with HW? > >Signed-off-by: Shannon Nelson <shannon.nelson@amd.com> >--- > drivers/net/ethernet/amd/pds_core/core.c | 60 ++++++++++++++++++++++++ > drivers/net/ethernet/amd/pds_core/dev.c | 3 ++ > drivers/net/ethernet/amd/pds_core/main.c | 51 ++++++++++++++++++++ > include/linux/pds/pds_core.h | 10 ++++ > 4 files changed, 124 insertions(+) > >diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c >index 88a6aa42cc28..87717bf40e80 100644 >--- a/drivers/net/ethernet/amd/pds_core/core.c >+++ b/drivers/net/ethernet/amd/pds_core/core.c >@@ -34,3 +34,63 @@ void pdsc_teardown(struct pdsc *pdsc, bool removing) > > set_bit(PDSC_S_FW_DEAD, &pdsc->state); > } >+ >+static void pdsc_fw_down(struct pdsc *pdsc) >+{ >+ mutex_lock(&pdsc->config_lock); >+ >+ if (test_and_set_bit(PDSC_S_FW_DEAD, &pdsc->state)) { >+ dev_err(pdsc->dev, "%s: already happening\n", __func__); >+ mutex_unlock(&pdsc->config_lock); >+ return; >+ } >+ >+ pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY); >+ >+ mutex_unlock(&pdsc->config_lock); >+} >+ >+static void pdsc_fw_up(struct pdsc *pdsc) >+{ >+ int err; >+ >+ mutex_lock(&pdsc->config_lock); >+ >+ if (!test_bit(PDSC_S_FW_DEAD, &pdsc->state)) { >+ dev_err(pdsc->dev, "%s: fw not dead\n", __func__); >+ mutex_unlock(&pdsc->config_lock); >+ return; >+ } >+ >+ err = pdsc_setup(pdsc, PDSC_SETUP_RECOVERY); >+ if (err) >+ goto err_out; >+ >+ mutex_unlock(&pdsc->config_lock); >+ >+ return; >+ >+err_out: >+ pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY); >+ mutex_unlock(&pdsc->config_lock); >+} >+ >+void pdsc_health_thread(struct work_struct *work) >+{ >+ struct pdsc *pdsc = container_of(work, struct pdsc, health_work); >+ bool healthy; >+ >+ healthy = pdsc_is_fw_good(pdsc); >+ dev_dbg(pdsc->dev, "%s: health %d fw_status %#02x fw_heartbeat %d\n", >+ __func__, healthy, pdsc->fw_status, pdsc->last_hb); >+ >+ if (test_bit(PDSC_S_FW_DEAD, &pdsc->state)) { >+ if (healthy) >+ pdsc_fw_up(pdsc); >+ } else { >+ if (!healthy) >+ pdsc_fw_down(pdsc); >+ } >+ >+ pdsc->fw_generation = pdsc->fw_status & PDS_CORE_FW_STS_F_GENERATION; >+} >diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c >index 7a9db1505c1f..43229c149b2f 100644 >--- a/drivers/net/ethernet/amd/pds_core/dev.c >+++ b/drivers/net/ethernet/amd/pds_core/dev.c >@@ -177,6 +177,9 @@ int pdsc_devcmd_locked(struct pdsc *pdsc, union pds_core_dev_cmd *cmd, > err = pdsc_devcmd_wait(pdsc, max_seconds); > memcpy_fromio(comp, &pdsc->cmd_regs->comp, sizeof(*comp)); > >+ if (err == -ENXIO || err == -ETIMEDOUT) >+ pdsc_queue_health_check(pdsc); >+ > return err; > } > >diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c >index 0c63bbe8b417..ac1b30f78e0e 100644 >--- a/drivers/net/ethernet/amd/pds_core/main.c >+++ b/drivers/net/ethernet/amd/pds_core/main.c >@@ -25,6 +25,31 @@ static const struct pci_device_id pdsc_id_table[] = { > }; > MODULE_DEVICE_TABLE(pci, pdsc_id_table); > >+void pdsc_queue_health_check(struct pdsc *pdsc) >+{ >+ unsigned long mask; >+ >+ /* Don't do a check when in a transition state */ >+ mask = BIT_ULL(PDSC_S_INITING_DRIVER) | >+ BIT_ULL(PDSC_S_STOPPING_DRIVER); >+ if (pdsc->state & mask) >+ return; >+ >+ /* Queue a new health check if one isn't already queued */ >+ queue_work(pdsc->wq, &pdsc->health_work); >+} >+ >+static void pdsc_wdtimer_cb(struct timer_list *t) >+{ >+ struct pdsc *pdsc = from_timer(pdsc, t, wdtimer); >+ >+ dev_dbg(pdsc->dev, "%s: jiffies %ld\n", __func__, jiffies); >+ mod_timer(&pdsc->wdtimer, >+ round_jiffies(jiffies + pdsc->wdtimer_period)); >+ >+ pdsc_queue_health_check(pdsc); >+} >+ > static void pdsc_unmap_bars(struct pdsc *pdsc) > { > struct pdsc_dev_bar *bars = pdsc->bars; >@@ -135,8 +160,11 @@ static int pdsc_init_vf(struct pdsc *pdsc) > return -1; > } > >+#define PDSC_WQ_NAME_LEN 24 >+ > static int pdsc_init_pf(struct pdsc *pdsc) > { >+ char wq_name[PDSC_WQ_NAME_LEN]; > int err; > > pcie_print_link_status(pdsc->pdev); >@@ -151,6 +179,13 @@ static int pdsc_init_pf(struct pdsc *pdsc) > if (err) > goto err_out_release_regions; > >+ /* General workqueue and timer, but don't start timer yet */ >+ snprintf(wq_name, sizeof(wq_name), "%s.%d", PDS_CORE_DRV_NAME, pdsc->id); >+ pdsc->wq = create_singlethread_workqueue(wq_name); >+ INIT_WORK(&pdsc->health_work, pdsc_health_thread); >+ timer_setup(&pdsc->wdtimer, pdsc_wdtimer_cb, 0); >+ pdsc->wdtimer_period = PDSC_WATCHDOG_SECS * HZ; >+ > mutex_init(&pdsc->devcmd_lock); > mutex_init(&pdsc->config_lock); > >@@ -167,11 +202,20 @@ static int pdsc_init_pf(struct pdsc *pdsc) > > mutex_unlock(&pdsc->config_lock); > >+ /* Lastly, start the health check timer */ >+ mod_timer(&pdsc->wdtimer, round_jiffies(jiffies + pdsc->wdtimer_period)); >+ > return 0; > > err_out_teardown: > pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING); > err_out_unmap_bars: >+ del_timer_sync(&pdsc->wdtimer); >+ if (pdsc->wq) { >+ flush_workqueue(pdsc->wq); >+ destroy_workqueue(pdsc->wq); >+ pdsc->wq = NULL; >+ } > mutex_unlock(&pdsc->config_lock); > mutex_destroy(&pdsc->config_lock); > mutex_destroy(&pdsc->devcmd_lock); >@@ -262,6 +306,13 @@ static void pdsc_remove(struct pci_dev *pdev) > mutex_lock(&pdsc->config_lock); > set_bit(PDSC_S_STOPPING_DRIVER, &pdsc->state); > >+ del_timer_sync(&pdsc->wdtimer); >+ if (pdsc->wq) { >+ flush_workqueue(pdsc->wq); >+ destroy_workqueue(pdsc->wq); >+ pdsc->wq = NULL; >+ } >+ > pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING); > mutex_unlock(&pdsc->config_lock); > mutex_destroy(&pdsc->config_lock); >diff --git a/include/linux/pds/pds_core.h b/include/linux/pds/pds_core.h >index da0663925018..e73af422fae0 100644 >--- a/include/linux/pds/pds_core.h >+++ b/include/linux/pds/pds_core.h >@@ -12,6 +12,8 @@ > #include <linux/pds/pds_intr.h> > > #define PDSC_DRV_DESCRIPTION "AMD/Pensando Core Driver" >+ >+#define PDSC_WATCHDOG_SECS 5 > #define PDSC_TEARDOWN_RECOVERY false > #define PDSC_TEARDOWN_REMOVING true > #define PDSC_SETUP_RECOVERY false >@@ -63,12 +65,17 @@ struct pdsc { > u8 fw_generation; > unsigned long last_fw_time; > u32 last_hb; >+ struct timer_list wdtimer; >+ unsigned int wdtimer_period; >+ struct work_struct health_work; > > struct pdsc_devinfo dev_info; > struct pds_core_dev_identity dev_ident; > unsigned int nintrs; > struct pdsc_intr_info *intr_info; /* array of nintrs elements */ > >+ struct workqueue_struct *wq; >+ > unsigned int devcmd_timeout; > struct mutex devcmd_lock; /* lock for dev_cmd operations */ > struct mutex config_lock; /* lock for configuration operations */ >@@ -81,6 +88,8 @@ struct pdsc { > u64 __iomem *kern_dbpage; > }; > >+void pdsc_queue_health_check(struct pdsc *pdsc); >+ > struct pdsc *pdsc_dl_alloc(struct device *dev, bool is_pf); > void pdsc_dl_free(struct pdsc *pdsc); > int pdsc_dl_register(struct pdsc *pdsc); >@@ -116,5 +125,6 @@ int pdsc_dev_init(struct pdsc *pdsc); > > int pdsc_setup(struct pdsc *pdsc, bool init); > void pdsc_teardown(struct pdsc *pdsc, bool removing); >+void pdsc_health_thread(struct work_struct *work); > > #endif /* _PDSC_H_ */ >-- >2.17.1 >
On 3/8/23 1:39 AM, Jiri Pirko wrote: > Wed, Mar 08, 2023 at 06:13:00AM CET, shannon.nelson@amd.com wrote: >> Add in the periodic health check and the related workqueue, >> as well as the handlers for when a FW reset is seen. > > Why don't you use devlink health to let the user know that something odd > happened with HW? Just haven't gotten to that yet. sln
Thu, Mar 09, 2023 at 03:08:44AM CET, shannon.nelson@amd.com wrote: >On 3/8/23 1:39 AM, Jiri Pirko wrote: >> Wed, Mar 08, 2023 at 06:13:00AM CET, shannon.nelson@amd.com wrote: >> > Add in the periodic health check and the related workqueue, >> > as well as the handlers for when a FW reset is seen. >> >> Why don't you use devlink health to let the user know that something odd >> happened with HW? > >Just haven't gotten to that yet. Please do it from start, this is exactly why devlink health was introduced. >sln >
On 3/9/23 1:30 AM, Jiri Pirko wrote: > Thu, Mar 09, 2023 at 03:08:44AM CET, shannon.nelson@amd.com wrote: >> On 3/8/23 1:39 AM, Jiri Pirko wrote: >>> Wed, Mar 08, 2023 at 06:13:00AM CET, shannon.nelson@amd.com wrote: >>>> Add in the periodic health check and the related workqueue, >>>> as well as the handlers for when a FW reset is seen. >>> >>> Why don't you use devlink health to let the user know that something odd >>> happened with HW? >> >> Just haven't gotten to that yet. > > Please do it from start, this is exactly why devlink health was > introduced. I'll add it in the next rev. sln
diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index 88a6aa42cc28..87717bf40e80 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -34,3 +34,63 @@ void pdsc_teardown(struct pdsc *pdsc, bool removing) set_bit(PDSC_S_FW_DEAD, &pdsc->state); } + +static void pdsc_fw_down(struct pdsc *pdsc) +{ + mutex_lock(&pdsc->config_lock); + + if (test_and_set_bit(PDSC_S_FW_DEAD, &pdsc->state)) { + dev_err(pdsc->dev, "%s: already happening\n", __func__); + mutex_unlock(&pdsc->config_lock); + return; + } + + pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY); + + mutex_unlock(&pdsc->config_lock); +} + +static void pdsc_fw_up(struct pdsc *pdsc) +{ + int err; + + mutex_lock(&pdsc->config_lock); + + if (!test_bit(PDSC_S_FW_DEAD, &pdsc->state)) { + dev_err(pdsc->dev, "%s: fw not dead\n", __func__); + mutex_unlock(&pdsc->config_lock); + return; + } + + err = pdsc_setup(pdsc, PDSC_SETUP_RECOVERY); + if (err) + goto err_out; + + mutex_unlock(&pdsc->config_lock); + + return; + +err_out: + pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY); + mutex_unlock(&pdsc->config_lock); +} + +void pdsc_health_thread(struct work_struct *work) +{ + struct pdsc *pdsc = container_of(work, struct pdsc, health_work); + bool healthy; + + healthy = pdsc_is_fw_good(pdsc); + dev_dbg(pdsc->dev, "%s: health %d fw_status %#02x fw_heartbeat %d\n", + __func__, healthy, pdsc->fw_status, pdsc->last_hb); + + if (test_bit(PDSC_S_FW_DEAD, &pdsc->state)) { + if (healthy) + pdsc_fw_up(pdsc); + } else { + if (!healthy) + pdsc_fw_down(pdsc); + } + + pdsc->fw_generation = pdsc->fw_status & PDS_CORE_FW_STS_F_GENERATION; +} diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c index 7a9db1505c1f..43229c149b2f 100644 --- a/drivers/net/ethernet/amd/pds_core/dev.c +++ b/drivers/net/ethernet/amd/pds_core/dev.c @@ -177,6 +177,9 @@ int pdsc_devcmd_locked(struct pdsc *pdsc, union pds_core_dev_cmd *cmd, err = pdsc_devcmd_wait(pdsc, max_seconds); memcpy_fromio(comp, &pdsc->cmd_regs->comp, sizeof(*comp)); + if (err == -ENXIO || err == -ETIMEDOUT) + pdsc_queue_health_check(pdsc); + return err; } diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c index 0c63bbe8b417..ac1b30f78e0e 100644 --- a/drivers/net/ethernet/amd/pds_core/main.c +++ b/drivers/net/ethernet/amd/pds_core/main.c @@ -25,6 +25,31 @@ static const struct pci_device_id pdsc_id_table[] = { }; MODULE_DEVICE_TABLE(pci, pdsc_id_table); +void pdsc_queue_health_check(struct pdsc *pdsc) +{ + unsigned long mask; + + /* Don't do a check when in a transition state */ + mask = BIT_ULL(PDSC_S_INITING_DRIVER) | + BIT_ULL(PDSC_S_STOPPING_DRIVER); + if (pdsc->state & mask) + return; + + /* Queue a new health check if one isn't already queued */ + queue_work(pdsc->wq, &pdsc->health_work); +} + +static void pdsc_wdtimer_cb(struct timer_list *t) +{ + struct pdsc *pdsc = from_timer(pdsc, t, wdtimer); + + dev_dbg(pdsc->dev, "%s: jiffies %ld\n", __func__, jiffies); + mod_timer(&pdsc->wdtimer, + round_jiffies(jiffies + pdsc->wdtimer_period)); + + pdsc_queue_health_check(pdsc); +} + static void pdsc_unmap_bars(struct pdsc *pdsc) { struct pdsc_dev_bar *bars = pdsc->bars; @@ -135,8 +160,11 @@ static int pdsc_init_vf(struct pdsc *pdsc) return -1; } +#define PDSC_WQ_NAME_LEN 24 + static int pdsc_init_pf(struct pdsc *pdsc) { + char wq_name[PDSC_WQ_NAME_LEN]; int err; pcie_print_link_status(pdsc->pdev); @@ -151,6 +179,13 @@ static int pdsc_init_pf(struct pdsc *pdsc) if (err) goto err_out_release_regions; + /* General workqueue and timer, but don't start timer yet */ + snprintf(wq_name, sizeof(wq_name), "%s.%d", PDS_CORE_DRV_NAME, pdsc->id); + pdsc->wq = create_singlethread_workqueue(wq_name); + INIT_WORK(&pdsc->health_work, pdsc_health_thread); + timer_setup(&pdsc->wdtimer, pdsc_wdtimer_cb, 0); + pdsc->wdtimer_period = PDSC_WATCHDOG_SECS * HZ; + mutex_init(&pdsc->devcmd_lock); mutex_init(&pdsc->config_lock); @@ -167,11 +202,20 @@ static int pdsc_init_pf(struct pdsc *pdsc) mutex_unlock(&pdsc->config_lock); + /* Lastly, start the health check timer */ + mod_timer(&pdsc->wdtimer, round_jiffies(jiffies + pdsc->wdtimer_period)); + return 0; err_out_teardown: pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING); err_out_unmap_bars: + del_timer_sync(&pdsc->wdtimer); + if (pdsc->wq) { + flush_workqueue(pdsc->wq); + destroy_workqueue(pdsc->wq); + pdsc->wq = NULL; + } mutex_unlock(&pdsc->config_lock); mutex_destroy(&pdsc->config_lock); mutex_destroy(&pdsc->devcmd_lock); @@ -262,6 +306,13 @@ static void pdsc_remove(struct pci_dev *pdev) mutex_lock(&pdsc->config_lock); set_bit(PDSC_S_STOPPING_DRIVER, &pdsc->state); + del_timer_sync(&pdsc->wdtimer); + if (pdsc->wq) { + flush_workqueue(pdsc->wq); + destroy_workqueue(pdsc->wq); + pdsc->wq = NULL; + } + pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING); mutex_unlock(&pdsc->config_lock); mutex_destroy(&pdsc->config_lock); diff --git a/include/linux/pds/pds_core.h b/include/linux/pds/pds_core.h index da0663925018..e73af422fae0 100644 --- a/include/linux/pds/pds_core.h +++ b/include/linux/pds/pds_core.h @@ -12,6 +12,8 @@ #include <linux/pds/pds_intr.h> #define PDSC_DRV_DESCRIPTION "AMD/Pensando Core Driver" + +#define PDSC_WATCHDOG_SECS 5 #define PDSC_TEARDOWN_RECOVERY false #define PDSC_TEARDOWN_REMOVING true #define PDSC_SETUP_RECOVERY false @@ -63,12 +65,17 @@ struct pdsc { u8 fw_generation; unsigned long last_fw_time; u32 last_hb; + struct timer_list wdtimer; + unsigned int wdtimer_period; + struct work_struct health_work; struct pdsc_devinfo dev_info; struct pds_core_dev_identity dev_ident; unsigned int nintrs; struct pdsc_intr_info *intr_info; /* array of nintrs elements */ + struct workqueue_struct *wq; + unsigned int devcmd_timeout; struct mutex devcmd_lock; /* lock for dev_cmd operations */ struct mutex config_lock; /* lock for configuration operations */ @@ -81,6 +88,8 @@ struct pdsc { u64 __iomem *kern_dbpage; }; +void pdsc_queue_health_check(struct pdsc *pdsc); + struct pdsc *pdsc_dl_alloc(struct device *dev, bool is_pf); void pdsc_dl_free(struct pdsc *pdsc); int pdsc_dl_register(struct pdsc *pdsc); @@ -116,5 +125,6 @@ int pdsc_dev_init(struct pdsc *pdsc); int pdsc_setup(struct pdsc *pdsc, bool init); void pdsc_teardown(struct pdsc *pdsc, bool removing); +void pdsc_health_thread(struct work_struct *work); #endif /* _PDSC_H_ */
Add in the periodic health check and the related workqueue, as well as the handlers for when a FW reset is seen. Signed-off-by: Shannon Nelson <shannon.nelson@amd.com> --- drivers/net/ethernet/amd/pds_core/core.c | 60 ++++++++++++++++++++++++ drivers/net/ethernet/amd/pds_core/dev.c | 3 ++ drivers/net/ethernet/amd/pds_core/main.c | 51 ++++++++++++++++++++ include/linux/pds/pds_core.h | 10 ++++ 4 files changed, 124 insertions(+)