diff mbox series

[net-next,8/9] bnxt_en: implement hw health reporter

Message ID 1646470482-13763-9-git-send-email-michael.chan@broadcom.com (mailing list archive)
State Accepted
Commit bafed3f231f7037ce881de2278c14a679ee9c937
Delegated to: Netdev Maintainers
Headers show
Series bnxt_en: Updates. | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2 this patch: 2
netdev/cc_maintainers success CCed 3 of 3 maintainers
netdev/build_clang success Errors and warnings before: 0 this patch: 0
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2 this patch: 2
netdev/checkpatch warning WARNING: line length of 86 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Michael Chan March 5, 2022, 8:54 a.m. UTC
From: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>

This reporter will report NVM errors which are non-fatal.
When we receive these NVM error events, we'll report it
through this new hw health reporter.

Reviewed-by: Edwin Peer <edwin.peer@broadcom.com>
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     | 19 +++++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     | 33 +++++++++
 .../net/ethernet/broadcom/bnxt/bnxt_devlink.c | 73 +++++++++++++++++++
 .../net/ethernet/broadcom/bnxt/bnxt_devlink.h |  1 +
 4 files changed, 126 insertions(+)

Comments

Jakub Kicinski March 7, 2022, 10:21 p.m. UTC | #1
On Sat,  5 Mar 2022 03:54:41 -0500 Michael Chan wrote:
> +static int bnxt_hw_recover(struct devlink_health_reporter *reporter,
> +			   void *priv_ctx,
> +			   struct netlink_ext_ack *extack)
> +{
> +	struct bnxt *bp = devlink_health_reporter_priv(reporter);
> +	struct bnxt_hw_health *hw_health = &bp->hw_health;
> +
> +	hw_health->synd = BNXT_HW_STATUS_HEALTHY;
> +	return 0;
> +}

This seems like a very weird recovery function.

I guess the FW automatically does auto-recovery and the driver 
has no control over it?
diff mbox series

Patch

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 2de02950086f..63b8fc4f9d42 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2061,6 +2061,22 @@  static void bnxt_event_error_report(struct bnxt *bp, u32 data1, u32 data2)
 	case ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD:
 		netdev_warn(bp->dev, "One or more MMIO doorbells dropped by the device!\n");
 		break;
+	case ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM: {
+		struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+		hw_health->nvm_err_address = EVENT_DATA2_NVM_ERR_ADDR(data2);
+		if (EVENT_DATA1_NVM_ERR_TYPE_WRITE(data1)) {
+			hw_health->synd = BNXT_HW_STATUS_NVM_WRITE_ERR;
+			hw_health->nvm_write_errors++;
+		} else if (EVENT_DATA1_NVM_ERR_TYPE_ERASE(data1)) {
+			hw_health->synd = BNXT_HW_STATUS_NVM_ERASE_ERR;
+			hw_health->nvm_erase_errors++;
+		} else {
+			hw_health->synd = BNXT_HW_STATUS_NVM_UNKNOWN_ERR;
+		}
+		set_bit(BNXT_FW_NVM_ERR_SP_EVENT, &bp->sp_event);
+		break;
+	}
 	default:
 		netdev_err(bp->dev, "FW reported unknown error type %u\n",
 			   err_type);
@@ -11887,6 +11903,9 @@  static void bnxt_sp_task(struct work_struct *work)
 	if (test_and_clear_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event))
 		bnxt_fw_echo_reply(bp);
 
+	if (test_and_clear_bit(BNXT_FW_NVM_ERR_SP_EVENT, &bp->sp_event))
+		bnxt_devlink_health_hw_report(bp);
+
 	/* These functions below will clear BNXT_STATE_IN_SP_TASK.  They
 	 * must be the last functions to be called before exiting.
 	 */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 447a9406b8a2..fa0df43ddc1a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -516,6 +516,21 @@  struct rx_tpa_end_cmp_ext {
 	  ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_MASK) >>\
 	 ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_SFT)
 
+#define EVENT_DATA2_NVM_ERR_ADDR(data2)					\
+	(((data2) &							\
+	  ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_MASK) >>\
+	 ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_SFT)
+
+#define EVENT_DATA1_NVM_ERR_TYPE_WRITE(data1)				\
+	(((data1) &							\
+	  ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK) ==\
+	 ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_WRITE)
+
+#define EVENT_DATA1_NVM_ERR_TYPE_ERASE(data1)				\
+	(((data1) &							\
+	  ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK) ==\
+	 ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_ERASE)
+
 struct nqe_cn {
 	__le16	type;
 	#define NQ_CN_TYPE_MASK           0x3fUL
@@ -1528,6 +1543,21 @@  struct bnxt_ctx_mem_info {
 	struct bnxt_mem_init	mem_init[BNXT_CTX_MEM_INIT_MAX];
 };
 
+enum bnxt_hw_err {
+	BNXT_HW_STATUS_HEALTHY		= 0x0,
+	BNXT_HW_STATUS_NVM_WRITE_ERR	= 0x1,
+	BNXT_HW_STATUS_NVM_ERASE_ERR	= 0x2,
+	BNXT_HW_STATUS_NVM_UNKNOWN_ERR	= 0x3,
+};
+
+struct bnxt_hw_health {
+	u32 nvm_err_address;
+	u32 nvm_write_errors;
+	u32 nvm_erase_errors;
+	u8 synd;
+	struct devlink_health_reporter *hw_reporter;
+};
+
 enum bnxt_health_severity {
 	SEVERITY_NORMAL = 0,
 	SEVERITY_WARNING,
@@ -2045,6 +2075,7 @@  struct bnxt {
 #define BNXT_FW_EXCEPTION_SP_EVENT	19
 #define BNXT_LINK_CFG_CHANGE_SP_EVENT	21
 #define BNXT_FW_ECHO_REQUEST_SP_EVENT	23
+#define BNXT_FW_NVM_ERR_SP_EVENT	25
 
 	struct delayed_work	fw_reset_task;
 	int			fw_reset_state;
@@ -2145,6 +2176,8 @@  struct bnxt {
 	struct dentry		*debugfs_pdev;
 	struct device		*hwmon_dev;
 	enum board_idx		board_idx;
+
+	struct bnxt_hw_health	hw_health;
 };
 
 #define BNXT_NUM_RX_RING_STATS			8
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 0c17f90d44a2..a802bbda1c27 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -241,6 +241,69 @@  static const struct devlink_health_reporter_ops bnxt_dl_fw_reporter_ops = {
 	.recover = bnxt_fw_recover,
 };
 
+static int bnxt_hw_recover(struct devlink_health_reporter *reporter,
+			   void *priv_ctx,
+			   struct netlink_ext_ack *extack)
+{
+	struct bnxt *bp = devlink_health_reporter_priv(reporter);
+	struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+	hw_health->synd = BNXT_HW_STATUS_HEALTHY;
+	return 0;
+}
+
+static const char *hw_err_str(u8 synd)
+{
+	switch (synd) {
+	case BNXT_HW_STATUS_HEALTHY:
+		return "healthy";
+	case BNXT_HW_STATUS_NVM_WRITE_ERR:
+		return "nvm write error";
+	case BNXT_HW_STATUS_NVM_ERASE_ERR:
+		return "nvm erase error";
+	case BNXT_HW_STATUS_NVM_UNKNOWN_ERR:
+		return "unrecognized nvm error";
+	default:
+		return "unknown hw error";
+	}
+}
+
+static int bnxt_hw_diagnose(struct devlink_health_reporter *reporter,
+			    struct devlink_fmsg *fmsg,
+			    struct netlink_ext_ack *extack)
+{
+	struct bnxt *bp = devlink_health_reporter_priv(reporter);
+	struct bnxt_hw_health *h = &bp->hw_health;
+	int rc;
+
+	rc = devlink_fmsg_string_pair_put(fmsg, "Status", hw_err_str(h->synd));
+	if (rc)
+		return rc;
+	rc = devlink_fmsg_u32_pair_put(fmsg, "nvm_write_errors", h->nvm_write_errors);
+	if (rc)
+		return rc;
+	rc = devlink_fmsg_u32_pair_put(fmsg, "nvm_erase_errors", h->nvm_erase_errors);
+	if (rc)
+		return rc;
+	return 0;
+}
+
+void bnxt_devlink_health_hw_report(struct bnxt *bp)
+{
+	struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+	netdev_warn(bp->dev, "%s reported at address 0x%x\n", hw_err_str(hw_health->synd),
+		    hw_health->nvm_err_address);
+
+	devlink_health_report(hw_health->hw_reporter, hw_err_str(hw_health->synd), NULL);
+}
+
+static const struct devlink_health_reporter_ops bnxt_dl_hw_reporter_ops = {
+	.name = "hw",
+	.diagnose = bnxt_hw_diagnose,
+	.recover = bnxt_hw_recover,
+};
+
 static struct devlink_health_reporter *
 __bnxt_dl_reporter_create(struct bnxt *bp,
 			  const struct devlink_health_reporter_ops *ops)
@@ -260,6 +323,10 @@  __bnxt_dl_reporter_create(struct bnxt *bp,
 void bnxt_dl_fw_reporters_create(struct bnxt *bp)
 {
 	struct bnxt_fw_health *fw_health = bp->fw_health;
+	struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+	if (!hw_health->hw_reporter)
+		hw_health->hw_reporter = __bnxt_dl_reporter_create(bp, &bnxt_dl_hw_reporter_ops);
 
 	if (fw_health && !fw_health->fw_reporter)
 		fw_health->fw_reporter = __bnxt_dl_reporter_create(bp, &bnxt_dl_fw_reporter_ops);
@@ -268,6 +335,12 @@  void bnxt_dl_fw_reporters_create(struct bnxt *bp)
 void bnxt_dl_fw_reporters_destroy(struct bnxt *bp)
 {
 	struct bnxt_fw_health *fw_health = bp->fw_health;
+	struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+	if (hw_health->hw_reporter) {
+		devlink_health_reporter_destroy(hw_health->hw_reporter);
+		hw_health->hw_reporter = NULL;
+	}
 
 	if (fw_health && fw_health->fw_reporter) {
 		devlink_health_reporter_destroy(fw_health->fw_reporter);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
index b8105065367b..056962e4b177 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
@@ -74,6 +74,7 @@  enum bnxt_dl_version_type {
 void bnxt_devlink_health_fw_report(struct bnxt *bp);
 void bnxt_dl_health_fw_status_update(struct bnxt *bp, bool healthy);
 void bnxt_dl_health_fw_recovery_done(struct bnxt *bp);
+void bnxt_devlink_health_hw_report(struct bnxt *bp);
 void bnxt_dl_fw_reporters_create(struct bnxt *bp);
 void bnxt_dl_fw_reporters_destroy(struct bnxt *bp);
 int bnxt_dl_register(struct bnxt *bp);