diff mbox series

[net-next,v4,8/8] octeon_ep: add heartbeat monitor

Message ID 20230322091958.13103-9-vburru@marvell.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series octeon_ep: deferred probe and mailbox | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 18 this patch: 18
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 18 this patch: 18
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 18 this patch: 18
netdev/checkpatch warning WARNING: line length of 84 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Veerasenareddy Burru March 22, 2023, 9:19 a.m. UTC
Monitor periodic heartbeat messages from device firmware.
Presence of heartbeat indicates the device is active and running.
A heartbeat missed for the configured interval indicates that the
firmware has crashed and the device is unusable; in this case, the PF
driver stops and uninitializes the device.

Signed-off-by: Veerasenareddy Burru <vburru@marvell.com>
Signed-off-by: Abhijit Ayarekar <aayarekar@marvell.com>
---
v3 -> v4:
 * 0007-xxx.patch in v3 is 0008-xxx.patch in v4.

v2 -> v3:
 * 0009-xxx.patch in v2 is now 0007-xxx.patch in v3 due to
   0007 and 0008.patch from v2 are removed in v3.

v1 -> v2:
 * no change

 .../marvell/octeon_ep/octep_cn9k_pf.c         |  9 ++++
 .../ethernet/marvell/octeon_ep/octep_config.h |  6 +++
 .../ethernet/marvell/octeon_ep/octep_main.c   | 45 ++++++++++++++++++-
 .../ethernet/marvell/octeon_ep/octep_main.h   |  7 +++
 .../marvell/octeon_ep/octep_regs_cn9k_pf.h    |  2 +
 5 files changed, 67 insertions(+), 2 deletions(-)

Comments

Leon Romanovsky March 23, 2023, 10:47 a.m. UTC | #1
On Wed, Mar 22, 2023 at 02:19:57AM -0700, Veerasenareddy Burru wrote:
> Monitor periodic heartbeat messages from device firmware.
> Presence of heartbeat indicates the device is active and running.
> If the heartbeat is missed for configured interval indicates
> firmware has crashed and device is unusable; in this case, PF driver
> stops and uninitialize the device.
> 
> Signed-off-by: Veerasenareddy Burru <vburru@marvell.com>
> Signed-off-by: Abhijit Ayarekar <aayarekar@marvell.com>
> ---
> v3 -> v4:
>  * 0007-xxx.patch in v3 is 0008-xxx.patch in v4.
> 
> v2 -> v3:
>  * 0009-xxx.patch in v2 is now 0007-xxx.patch in v3 due to
>    0007 and 0008.patch from v2 are removed in v3.
> 
> v1 -> v2:
>  * no change
> 
>  .../marvell/octeon_ep/octep_cn9k_pf.c         |  9 ++++
>  .../ethernet/marvell/octeon_ep/octep_config.h |  6 +++
>  .../ethernet/marvell/octeon_ep/octep_main.c   | 45 ++++++++++++++++++-
>  .../ethernet/marvell/octeon_ep/octep_main.h   |  7 +++
>  .../marvell/octeon_ep/octep_regs_cn9k_pf.h    |  2 +
>  5 files changed, 67 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c b/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
> index e2503c9bc8a1..90c3a419932d 100644
> --- a/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
> +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
> @@ -16,6 +16,9 @@
>  #define CTRL_MBOX_MAX_PF	128
>  #define CTRL_MBOX_SZ		((size_t)(0x400000 / CTRL_MBOX_MAX_PF))
>  
> +#define FW_HB_INTERVAL_IN_SECS		1
> +#define FW_HB_MISS_COUNT		10
> +
>  /* Names of Hardware non-queue generic interrupts */
>  static char *cn93_non_ioq_msix_names[] = {
>  	"epf_ire_rint",
> @@ -249,6 +252,10 @@ static void octep_init_config_cn93_pf(struct octep_device *oct)
>  	conf->ctrl_mbox_cfg.barmem_addr = (void __iomem *)oct->mmio[2].hw_addr +
>  					   (0x400000ull * 7) +
>  					   (link * CTRL_MBOX_SZ);
> +
> +	conf->hb_interval = FW_HB_INTERVAL_IN_SECS;
> +	conf->max_hb_miss_cnt = FW_HB_MISS_COUNT;
> +
>  }
>  
>  /* Setup registers for a hardware Tx Queue  */
> @@ -383,6 +390,8 @@ static bool octep_poll_non_ioq_interrupts_cn93_pf(struct octep_device *oct)
>  		octep_write_csr64(oct, CN93_SDP_EPF_OEI_RINT, reg0);
>  		if (reg0 & CN93_SDP_EPF_OEI_RINT_DATA_BIT_MBOX)
>  			queue_work(octep_wq, &oct->ctrl_mbox_task);
> +		else if (reg0 & CN93_SDP_EPF_OEI_RINT_DATA_BIT_HBEAT)
> +			atomic_set(&oct->hb_miss_cnt, 0);
>  
>  		handled = true;
>  	}
> diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
> index f208f3f9a447..df7cd39d9fce 100644
> --- a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
> +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
> @@ -200,5 +200,11 @@ struct octep_config {
>  
>  	/* ctrl mbox config */
>  	struct octep_ctrl_mbox_config ctrl_mbox_cfg;
> +
> +	/* Configured maximum heartbeat miss count */
> +	u32 max_hb_miss_cnt;
> +
> +	/* Configured firmware heartbeat interval in secs */
> +	u32 hb_interval;
>  };
>  #endif /* _OCTEP_CONFIG_H_ */
> diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
> index ba0d5fe3081d..415dd06ff344 100644
> --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
> +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
> @@ -901,6 +901,38 @@ static void octep_intr_poll_task(struct work_struct *work)
>  			   msecs_to_jiffies(OCTEP_INTR_POLL_TIME_MSECS));
>  }
>  
> +/**
> + * octep_hb_timeout_task - work queue task to check firmware heartbeat.
> + *
> + * @work: pointer to hb work_struct
> + *
> + * Check for heartbeat miss count. Uninitialize oct device if miss count
> + * exceeds configured max heartbeat miss count.
> + *
> + **/
> +static void octep_hb_timeout_task(struct work_struct *work)
> +{
> +	struct octep_device *oct = container_of(work, struct octep_device,
> +						hb_task.work);
> +
> +	int miss_cnt;
> +
> +	atomic_inc(&oct->hb_miss_cnt);
> +	miss_cnt = atomic_read(&oct->hb_miss_cnt);

miss_cnt = atomic_inc_return(&oct->hb_miss_cnt);

> +	if (miss_cnt < oct->conf->max_hb_miss_cnt) {

How is this heartbeat working? You increment on every entry to octep_hb_timeout_task().
After max_hb_miss_cnt invocations, you will stop your device.

Thanks

> +		queue_delayed_work(octep_wq, &oct->hb_task,
> +				   msecs_to_jiffies(oct->conf->hb_interval * 1000));
> +		return;
> +	}
> +
> +	dev_err(&oct->pdev->dev, "Missed %u heartbeats. Uninitializing\n",
> +		miss_cnt);
> +	rtnl_lock();
> +	if (netif_running(oct->netdev))
> +		octep_stop(oct->netdev);
> +	rtnl_unlock();
> +}
> +
>  /**
>   * octep_ctrl_mbox_task - work queue task to handle ctrl mbox messages.
>   *
> @@ -938,7 +970,7 @@ static const char *octep_devid_to_str(struct octep_device *oct)
>  int octep_device_setup(struct octep_device *oct)
>  {
>  	struct pci_dev *pdev = oct->pdev;
> -	int i;
> +	int i, ret;
>  
>  	/* allocate memory for oct->conf */
>  	oct->conf = kzalloc(sizeof(*oct->conf), GFP_KERNEL);
> @@ -973,7 +1005,15 @@ int octep_device_setup(struct octep_device *oct)
>  
>  	oct->pkind = CFG_GET_IQ_PKIND(oct->conf);
>  
> -	return octep_ctrl_net_init(oct);
> +	ret = octep_ctrl_net_init(oct);
> +	if (ret)
> +		return ret;
> +
> +	atomic_set(&oct->hb_miss_cnt, 0);
> +	INIT_DELAYED_WORK(&oct->hb_task, octep_hb_timeout_task);
> +	queue_delayed_work(octep_wq, &oct->hb_task,
> +			   msecs_to_jiffies(oct->conf->hb_interval * 1000));
> +	return 0;
>  
>  unsupported_dev:
>  	for (i = 0; i < OCTEP_MMIO_REGIONS; i++)
> @@ -1002,6 +1042,7 @@ static void octep_device_cleanup(struct octep_device *oct)
>  	}
>  
>  	octep_ctrl_net_uninit(oct);
> +	cancel_delayed_work_sync(&oct->hb_task);
>  
>  	oct->hw_ops.soft_reset(oct);
>  	for (i = 0; i < OCTEP_MMIO_REGIONS; i++) {
> diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
> index 836d990ba3fa..e0907a719133 100644
> --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
> +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
> @@ -280,6 +280,13 @@ struct octep_device {
>  	bool poll_non_ioq_intr;
>  	/* Work entry to poll non-ioq interrupts */
>  	struct delayed_work intr_poll_task;
> +
> +	/* Firmware heartbeat timer */
> +	struct timer_list hb_timer;
> +	/* Firmware heartbeat miss count tracked by timer */
> +	atomic_t hb_miss_cnt;
> +	/* Task to reset device on heartbeat miss */
> +	struct delayed_work hb_task;
>  };
>  
>  static inline u16 OCTEP_MAJOR_REV(struct octep_device *oct)
> diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h b/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
> index 0466fd9a002d..b25c3093dc7b 100644
> --- a/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
> +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
> @@ -367,5 +367,7 @@
>  
>  /* bit 0 for control mbox interrupt */
>  #define CN93_SDP_EPF_OEI_RINT_DATA_BIT_MBOX	BIT_ULL(0)
> +/* bit 1 for firmware heartbeat interrupt */
> +#define CN93_SDP_EPF_OEI_RINT_DATA_BIT_HBEAT	BIT_ULL(1)
>  
>  #endif /* _OCTEP_REGS_CN9K_PF_H_ */
> -- 
> 2.36.0
>
Veerasenareddy Burru March 23, 2023, 6:14 p.m. UTC | #2
> -----Original Message-----
> From: Leon Romanovsky <leon@kernel.org>
> Sent: Thursday, March 23, 2023 3:47 AM
> To: Veerasenareddy Burru <vburru@marvell.com>
> Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Abhijit Ayarekar
> <aayarekar@marvell.com>; Sathesh B Edara <sedara@marvell.com>;
> Satananda Burla <sburla@marvell.com>; linux-doc@vger.kernel.org; David S.
> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>;
> Jakub Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>
> Subject: [EXT] Re: [PATCH net-next v4 8/8] octeon_ep: add heartbeat
> monitor
> 
> External Email
> 
> ----------------------------------------------------------------------
> On Wed, Mar 22, 2023 at 02:19:57AM -0700, Veerasenareddy Burru wrote:
> > Monitor periodic heartbeat messages from device firmware.
> > Presence of heartbeat indicates the device is active and running.
> > If the heartbeat is missed for configured interval indicates firmware
> > has crashed and device is unusable; in this case, PF driver stops and
> > uninitialize the device.
> >
> > Signed-off-by: Veerasenareddy Burru <vburru@marvell.com>
> > Signed-off-by: Abhijit Ayarekar <aayarekar@marvell.com>
> > ---
> > v3 -> v4:
> >  * 0007-xxx.patch in v3 is 0008-xxx.patch in v4.
> >
> > v2 -> v3:
> >  * 0009-xxx.patch in v2 is now 0007-xxx.patch in v3 due to
> >    0007 and 0008.patch from v2 are removed in v3.
> >
> > v1 -> v2:
> >  * no change
> >
> >  .../marvell/octeon_ep/octep_cn9k_pf.c         |  9 ++++
> >  .../ethernet/marvell/octeon_ep/octep_config.h |  6 +++
> >  .../ethernet/marvell/octeon_ep/octep_main.c   | 45
> ++++++++++++++++++-
> >  .../ethernet/marvell/octeon_ep/octep_main.h   |  7 +++
> >  .../marvell/octeon_ep/octep_regs_cn9k_pf.h    |  2 +
> >  5 files changed, 67 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
> > b/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
> > index e2503c9bc8a1..90c3a419932d 100644
> > --- a/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
> > +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
> > @@ -16,6 +16,9 @@
> >  #define CTRL_MBOX_MAX_PF	128
> >  #define CTRL_MBOX_SZ		((size_t)(0x400000 /
> CTRL_MBOX_MAX_PF))
> >
> > +#define FW_HB_INTERVAL_IN_SECS		1
> > +#define FW_HB_MISS_COUNT		10
> > +
> >  /* Names of Hardware non-queue generic interrupts */  static char
> > *cn93_non_ioq_msix_names[] = {
> >  	"epf_ire_rint",
> > @@ -249,6 +252,10 @@ static void octep_init_config_cn93_pf(struct
> octep_device *oct)
> >  	conf->ctrl_mbox_cfg.barmem_addr = (void __iomem *)oct-
> >mmio[2].hw_addr +
> >  					   (0x400000ull * 7) +
> >  					   (link * CTRL_MBOX_SZ);
> > +
> > +	conf->hb_interval = FW_HB_INTERVAL_IN_SECS;
> > +	conf->max_hb_miss_cnt = FW_HB_MISS_COUNT;
> > +
> >  }
> >
> >  /* Setup registers for a hardware Tx Queue  */ @@ -383,6 +390,8 @@
> > static bool octep_poll_non_ioq_interrupts_cn93_pf(struct octep_device
> *oct)
> >  		octep_write_csr64(oct, CN93_SDP_EPF_OEI_RINT, reg0);
> >  		if (reg0 & CN93_SDP_EPF_OEI_RINT_DATA_BIT_MBOX)
> >  			queue_work(octep_wq, &oct->ctrl_mbox_task);
> > +		else if (reg0 & CN93_SDP_EPF_OEI_RINT_DATA_BIT_HBEAT)
> > +			atomic_set(&oct->hb_miss_cnt, 0);
> >
> >  		handled = true;
> >  	}
> > diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
> > b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
> > index f208f3f9a447..df7cd39d9fce 100644
> > --- a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
> > +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
> > @@ -200,5 +200,11 @@ struct octep_config {
> >
> >  	/* ctrl mbox config */
> >  	struct octep_ctrl_mbox_config ctrl_mbox_cfg;
> > +
> > +	/* Configured maximum heartbeat miss count */
> > +	u32 max_hb_miss_cnt;
> > +
> > +	/* Configured firmware heartbeat interval in secs */
> > +	u32 hb_interval;
> >  };
> >  #endif /* _OCTEP_CONFIG_H_ */
> > diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
> > b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
> > index ba0d5fe3081d..415dd06ff344 100644
> > --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
> > +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
> > @@ -901,6 +901,38 @@ static void octep_intr_poll_task(struct work_struct
> *work)
> >
> msecs_to_jiffies(OCTEP_INTR_POLL_TIME_MSECS));
> >  }
> >
> > +/**
> > + * octep_hb_timeout_task - work queue task to check firmware
> heartbeat.
> > + *
> > + * @work: pointer to hb work_struct
> > + *
> > + * Check for heartbeat miss count. Uninitialize oct device if miss
> > +count
> > + * exceeds configured max heartbeat miss count.
> > + *
> > + **/
> > +static void octep_hb_timeout_task(struct work_struct *work) {
> > +	struct octep_device *oct = container_of(work, struct octep_device,
> > +						hb_task.work);
> > +
> > +	int miss_cnt;
> > +
> > +	atomic_inc(&oct->hb_miss_cnt);
> > +	miss_cnt = atomic_read(&oct->hb_miss_cnt);
> 
> miss_cnt = atomic_inc_return(&oct->hb_miss_cnt);
> 

Thanks for the feedback. Will fix it.

> > +	if (miss_cnt < oct->conf->max_hb_miss_cnt) {
> 
> How is this heartbeat working? You increment on every entry to
> octep_hb_timeout_task(), After max_hb_miss_cnt invocations, you will stop
> your device.
> 
> Thanks
> 

Yes, device will be stopped after max_hb_miss_cnt heartbeats are missed.

> > +		queue_delayed_work(octep_wq, &oct->hb_task,
> > +				   msecs_to_jiffies(oct->conf->hb_interval *
> 1000));
> > +		return;
> > +	}
> > +
> > +	dev_err(&oct->pdev->dev, "Missed %u heartbeats. Uninitializing\n",
> > +		miss_cnt);
> > +	rtnl_lock();
> > +	if (netif_running(oct->netdev))
> > +		octep_stop(oct->netdev);
> > +	rtnl_unlock();
> > +}
> > +
> >  /**
> >   * octep_ctrl_mbox_task - work queue task to handle ctrl mbox messages.
> >   *
> > @@ -938,7 +970,7 @@ static const char *octep_devid_to_str(struct
> > octep_device *oct)  int octep_device_setup(struct octep_device *oct)
> > {
> >  	struct pci_dev *pdev = oct->pdev;
> > -	int i;
> > +	int i, ret;
> >
> >  	/* allocate memory for oct->conf */
> >  	oct->conf = kzalloc(sizeof(*oct->conf), GFP_KERNEL); @@ -973,7
> > +1005,15 @@ int octep_device_setup(struct octep_device *oct)
> >
> >  	oct->pkind = CFG_GET_IQ_PKIND(oct->conf);
> >
> > -	return octep_ctrl_net_init(oct);
> > +	ret = octep_ctrl_net_init(oct);
> > +	if (ret)
> > +		return ret;
> > +
> > +	atomic_set(&oct->hb_miss_cnt, 0);
> > +	INIT_DELAYED_WORK(&oct->hb_task, octep_hb_timeout_task);
> > +	queue_delayed_work(octep_wq, &oct->hb_task,
> > +			   msecs_to_jiffies(oct->conf->hb_interval * 1000));
> > +	return 0;
> >
> >  unsupported_dev:
> >  	for (i = 0; i < OCTEP_MMIO_REGIONS; i++) @@ -1002,6 +1042,7 @@
> > static void octep_device_cleanup(struct octep_device *oct)
> >  	}
> >
> >  	octep_ctrl_net_uninit(oct);
> > +	cancel_delayed_work_sync(&oct->hb_task);
> >
> >  	oct->hw_ops.soft_reset(oct);
> >  	for (i = 0; i < OCTEP_MMIO_REGIONS; i++) { diff --git
> > a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
> > b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
> > index 836d990ba3fa..e0907a719133 100644
> > --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
> > +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
> > @@ -280,6 +280,13 @@ struct octep_device {
> >  	bool poll_non_ioq_intr;
> >  	/* Work entry to poll non-ioq interrupts */
> >  	struct delayed_work intr_poll_task;
> > +
> > +	/* Firmware heartbeat timer */
> > +	struct timer_list hb_timer;
> > +	/* Firmware heartbeat miss count tracked by timer */
> > +	atomic_t hb_miss_cnt;
> > +	/* Task to reset device on heartbeat miss */
> > +	struct delayed_work hb_task;
> >  };
> >
> >  static inline u16 OCTEP_MAJOR_REV(struct octep_device *oct) diff
> > --git a/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
> > b/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
> > index 0466fd9a002d..b25c3093dc7b 100644
> > --- a/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
> > +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
> > @@ -367,5 +367,7 @@
> >
> >  /* bit 0 for control mbox interrupt */
> >  #define CN93_SDP_EPF_OEI_RINT_DATA_BIT_MBOX	BIT_ULL(0)
> > +/* bit 1 for firmware heartbeat interrupt */
> > +#define CN93_SDP_EPF_OEI_RINT_DATA_BIT_HBEAT	BIT_ULL(1)
> >
> >  #endif /* _OCTEP_REGS_CN9K_PF_H_ */
> > --
> > 2.36.0
> >
Leon Romanovsky March 29, 2023, 7:33 a.m. UTC | #3
On Thu, Mar 23, 2023 at 06:14:10PM +0000, Veerasenareddy Burru wrote:
> 
> 
> > -----Original Message-----
> > From: Leon Romanovsky <leon@kernel.org>
> > Sent: Thursday, March 23, 2023 3:47 AM
> > To: Veerasenareddy Burru <vburru@marvell.com>
> > Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Abhijit Ayarekar
> > <aayarekar@marvell.com>; Sathesh B Edara <sedara@marvell.com>;
> > Satananda Burla <sburla@marvell.com>; linux-doc@vger.kernel.org; David S.
> > Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>;
> > Jakub Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>
> > Subject: [EXT] Re: [PATCH net-next v4 8/8] octeon_ep: add heartbeat
> > monitor
> > 
> > External Email
> > 
> > ----------------------------------------------------------------------
> > On Wed, Mar 22, 2023 at 02:19:57AM -0700, Veerasenareddy Burru wrote:
> > > Monitor periodic heartbeat messages from device firmware.
> > > Presence of heartbeat indicates the device is active and running.
> > > If the heartbeat is missed for configured interval indicates firmware
> > > has crashed and device is unusable; in this case, PF driver stops and
> > > uninitialize the device.
> > >
> > > Signed-off-by: Veerasenareddy Burru <vburru@marvell.com>
> > > Signed-off-by: Abhijit Ayarekar <aayarekar@marvell.com>
> > > ---
> > > v3 -> v4:
> > >  * 0007-xxx.patch in v3 is 0008-xxx.patch in v4.
> > >
> > > v2 -> v3:
> > >  * 0009-xxx.patch in v2 is now 0007-xxx.patch in v3 due to
> > >    0007 and 0008.patch from v2 are removed in v3.
> > >
> > > v1 -> v2:
> > >  * no change

<...>

> > > +	struct octep_device *oct = container_of(work, struct octep_device,
> > > +						hb_task.work);
> > > +
> > > +	int miss_cnt;
> > > +
> > > +	atomic_inc(&oct->hb_miss_cnt);
> > > +	miss_cnt = atomic_read(&oct->hb_miss_cnt);
> > 
> > miss_cnt = atomic_inc_return(&oct->hb_miss_cnt);
> > 
> 
> Thanks for the feedback. Will fix it.
> 
> > > +	if (miss_cnt < oct->conf->max_hb_miss_cnt) {
> > 
> > How is this heartbeat working? You increment on every entry to
> > octep_hb_timeout_task(), After max_hb_miss_cnt invocations, you will stop
> > your device.
> > 
> > Thanks
> > 
> 
> Yes, device will be stopped after max_hb_miss_cnt heartbeats are missed.

If I read the code correctly, the device will stop after octep_hb_timeout_task()
calls, which happen every msecs_to_jiffies(oct->conf->hb_interval * 1000).
You don't cancel/reschedule the job if the timeout doesn't happen.

Thanks

> 
> > > +		queue_delayed_work(octep_wq, &oct->hb_task,
> > > +				   msecs_to_jiffies(oct->conf->hb_interval *
> > 1000));
diff mbox series

Patch

diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c b/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
index e2503c9bc8a1..90c3a419932d 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_cn9k_pf.c
@@ -16,6 +16,9 @@ 
 #define CTRL_MBOX_MAX_PF	128
 #define CTRL_MBOX_SZ		((size_t)(0x400000 / CTRL_MBOX_MAX_PF))
 
+#define FW_HB_INTERVAL_IN_SECS		1
+#define FW_HB_MISS_COUNT		10
+
 /* Names of Hardware non-queue generic interrupts */
 static char *cn93_non_ioq_msix_names[] = {
 	"epf_ire_rint",
@@ -249,6 +252,10 @@  static void octep_init_config_cn93_pf(struct octep_device *oct)
 	conf->ctrl_mbox_cfg.barmem_addr = (void __iomem *)oct->mmio[2].hw_addr +
 					   (0x400000ull * 7) +
 					   (link * CTRL_MBOX_SZ);
+
+	conf->hb_interval = FW_HB_INTERVAL_IN_SECS;
+	conf->max_hb_miss_cnt = FW_HB_MISS_COUNT;
+
 }
 
 /* Setup registers for a hardware Tx Queue  */
@@ -383,6 +390,8 @@  static bool octep_poll_non_ioq_interrupts_cn93_pf(struct octep_device *oct)
 		octep_write_csr64(oct, CN93_SDP_EPF_OEI_RINT, reg0);
 		if (reg0 & CN93_SDP_EPF_OEI_RINT_DATA_BIT_MBOX)
 			queue_work(octep_wq, &oct->ctrl_mbox_task);
+		else if (reg0 & CN93_SDP_EPF_OEI_RINT_DATA_BIT_HBEAT)
+			atomic_set(&oct->hb_miss_cnt, 0);
 
 		handled = true;
 	}
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
index f208f3f9a447..df7cd39d9fce 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_config.h
@@ -200,5 +200,11 @@  struct octep_config {
 
 	/* ctrl mbox config */
 	struct octep_ctrl_mbox_config ctrl_mbox_cfg;
+
+	/* Configured maximum heartbeat miss count */
+	u32 max_hb_miss_cnt;
+
+	/* Configured firmware heartbeat interval in secs */
+	u32 hb_interval;
 };
 #endif /* _OCTEP_CONFIG_H_ */
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
index ba0d5fe3081d..415dd06ff344 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c
@@ -901,6 +901,38 @@  static void octep_intr_poll_task(struct work_struct *work)
 			   msecs_to_jiffies(OCTEP_INTR_POLL_TIME_MSECS));
 }
 
+/**
+ * octep_hb_timeout_task - work queue task to check firmware heartbeat.
+ *
+ * @work: pointer to hb work_struct
+ *
+ * Check for heartbeat miss count. Uninitialize oct device if miss count
+ * exceeds configured max heartbeat miss count.
+ *
+ **/
+static void octep_hb_timeout_task(struct work_struct *work)
+{
+	struct octep_device *oct = container_of(work, struct octep_device,
+						hb_task.work);
+
+	int miss_cnt;
+
+	atomic_inc(&oct->hb_miss_cnt);
+	miss_cnt = atomic_read(&oct->hb_miss_cnt);
+	if (miss_cnt < oct->conf->max_hb_miss_cnt) {
+		queue_delayed_work(octep_wq, &oct->hb_task,
+				   msecs_to_jiffies(oct->conf->hb_interval * 1000));
+		return;
+	}
+
+	dev_err(&oct->pdev->dev, "Missed %u heartbeats. Uninitializing\n",
+		miss_cnt);
+	rtnl_lock();
+	if (netif_running(oct->netdev))
+		octep_stop(oct->netdev);
+	rtnl_unlock();
+}
+
 /**
  * octep_ctrl_mbox_task - work queue task to handle ctrl mbox messages.
  *
@@ -938,7 +970,7 @@  static const char *octep_devid_to_str(struct octep_device *oct)
 int octep_device_setup(struct octep_device *oct)
 {
 	struct pci_dev *pdev = oct->pdev;
-	int i;
+	int i, ret;
 
 	/* allocate memory for oct->conf */
 	oct->conf = kzalloc(sizeof(*oct->conf), GFP_KERNEL);
@@ -973,7 +1005,15 @@  int octep_device_setup(struct octep_device *oct)
 
 	oct->pkind = CFG_GET_IQ_PKIND(oct->conf);
 
-	return octep_ctrl_net_init(oct);
+	ret = octep_ctrl_net_init(oct);
+	if (ret)
+		return ret;
+
+	atomic_set(&oct->hb_miss_cnt, 0);
+	INIT_DELAYED_WORK(&oct->hb_task, octep_hb_timeout_task);
+	queue_delayed_work(octep_wq, &oct->hb_task,
+			   msecs_to_jiffies(oct->conf->hb_interval * 1000));
+	return 0;
 
 unsupported_dev:
 	for (i = 0; i < OCTEP_MMIO_REGIONS; i++)
@@ -1002,6 +1042,7 @@  static void octep_device_cleanup(struct octep_device *oct)
 	}
 
 	octep_ctrl_net_uninit(oct);
+	cancel_delayed_work_sync(&oct->hb_task);
 
 	oct->hw_ops.soft_reset(oct);
 	for (i = 0; i < OCTEP_MMIO_REGIONS; i++) {
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
index 836d990ba3fa..e0907a719133 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.h
@@ -280,6 +280,13 @@  struct octep_device {
 	bool poll_non_ioq_intr;
 	/* Work entry to poll non-ioq interrupts */
 	struct delayed_work intr_poll_task;
+
+	/* Firmware heartbeat timer */
+	struct timer_list hb_timer;
+	/* Firmware heartbeat miss count tracked by timer */
+	atomic_t hb_miss_cnt;
+	/* Task to reset device on heartbeat miss */
+	struct delayed_work hb_task;
 };
 
 static inline u16 OCTEP_MAJOR_REV(struct octep_device *oct)
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h b/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
index 0466fd9a002d..b25c3093dc7b 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h
@@ -367,5 +367,7 @@ 
 
 /* bit 0 for control mbox interrupt */
 #define CN93_SDP_EPF_OEI_RINT_DATA_BIT_MBOX	BIT_ULL(0)
+/* bit 1 for firmware heartbeat interrupt */
+#define CN93_SDP_EPF_OEI_RINT_DATA_BIT_HBEAT	BIT_ULL(1)
 
 #endif /* _OCTEP_REGS_CN9K_PF_H_ */