diff mbox series

[net-next,4/4] mlxsw: core: Extend devlink health reporter with new events and parameters

Message ID 20211220105614.68622-5-idosch@nvidia.com (mailing list archive)
State Accepted
Delegated to: Netdev Maintainers
Headers show
Series mlxsw: devlink health reporter extensions | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/apply success Patch already applied to net-next

Commit Message

Ido Schimmel Dec. 20, 2021, 10:56 a.m. UTC
From: Danielle Ratson <danieller@nvidia.com>

Extend the devlink health reporter registered by mlxsw to report new
health events and their related parameters. These are meant to aid in
debugging of hardware / firmware issues.

Beside the test event ('MLXSW_REG_MFDE_EVENT_ID_TEST') that is triggered
following the devlink health 'test' sub-command, the new events are used
to report the triggering of asserts in firmware code
('MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT') and hardware issues
('MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE').

Each event is accompanied with a severity parameter and per-event
parameters that are meant to help root cause the detected issue.

Signed-off-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c | 131 +++++++++++++++++++++
 1 file changed, 131 insertions(+)
diff mbox series

Patch

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index d9f12d9cd0ff..866b9357939b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1708,12 +1708,93 @@  static void mlxsw_core_health_listener_func(const struct mlxsw_reg_info *reg,
 static const struct mlxsw_listener mlxsw_core_health_listener =
 	MLXSW_EVENTL(mlxsw_core_health_listener_func, MFDE, MFDE);
 
+static int
+mlxsw_core_health_fw_fatal_dump_fatal_cause(const char *mfde_pl,
+					    struct devlink_fmsg *fmsg)
+{
+	u32 val, tile_v;
+	int err;
+
+	val = mlxsw_reg_mfde_fatal_cause_id_get(mfde_pl);
+	err = devlink_fmsg_u32_pair_put(fmsg, "cause_id", val);
+	if (err)
+		return err;
+	tile_v = mlxsw_reg_mfde_fatal_cause_tile_v_get(mfde_pl);
+	if (tile_v) {
+		val = mlxsw_reg_mfde_fatal_cause_tile_index_get(mfde_pl);
+		err = devlink_fmsg_u8_pair_put(fmsg, "tile_index", val);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int
+mlxsw_core_health_fw_fatal_dump_fw_assert(const char *mfde_pl,
+					  struct devlink_fmsg *fmsg)
+{
+	u32 val, tile_v;
+	int err;
+
+	val = mlxsw_reg_mfde_fw_assert_var0_get(mfde_pl);
+	err = devlink_fmsg_u32_pair_put(fmsg, "var0", val);
+	if (err)
+		return err;
+	val = mlxsw_reg_mfde_fw_assert_var1_get(mfde_pl);
+	err = devlink_fmsg_u32_pair_put(fmsg, "var1", val);
+	if (err)
+		return err;
+	val = mlxsw_reg_mfde_fw_assert_var2_get(mfde_pl);
+	err = devlink_fmsg_u32_pair_put(fmsg, "var2", val);
+	if (err)
+		return err;
+	val = mlxsw_reg_mfde_fw_assert_var3_get(mfde_pl);
+	err = devlink_fmsg_u32_pair_put(fmsg, "var3", val);
+	if (err)
+		return err;
+	val = mlxsw_reg_mfde_fw_assert_var4_get(mfde_pl);
+	err = devlink_fmsg_u32_pair_put(fmsg, "var4", val);
+	if (err)
+		return err;
+	val = mlxsw_reg_mfde_fw_assert_existptr_get(mfde_pl);
+	err = devlink_fmsg_u32_pair_put(fmsg, "existptr", val);
+	if (err)
+		return err;
+	val = mlxsw_reg_mfde_fw_assert_callra_get(mfde_pl);
+	err = devlink_fmsg_u32_pair_put(fmsg, "callra", val);
+	if (err)
+		return err;
+	val = mlxsw_reg_mfde_fw_assert_oe_get(mfde_pl);
+	err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
+	if (err)
+		return err;
+	tile_v = mlxsw_reg_mfde_fw_assert_tile_v_get(mfde_pl);
+	if (tile_v) {
+		val = mlxsw_reg_mfde_fw_assert_tile_index_get(mfde_pl);
+		err = devlink_fmsg_u8_pair_put(fmsg, "tile_index", val);
+		if (err)
+			return err;
+	}
+	val = mlxsw_reg_mfde_fw_assert_ext_synd_get(mfde_pl);
+	err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd", val);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 static int
 mlxsw_core_health_fw_fatal_dump_kvd_im_stop(const char *mfde_pl,
 					    struct devlink_fmsg *fmsg)
 {
 	u32 val;
+	int err;
 
+	val = mlxsw_reg_mfde_kvd_im_stop_oe_get(mfde_pl);
+	err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
+	if (err)
+		return err;
 	val = mlxsw_reg_mfde_kvd_im_stop_pipes_mask_get(mfde_pl);
 	return devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", val);
 }
@@ -1727,6 +1808,10 @@  mlxsw_core_health_fw_fatal_dump_crspace_to(const char *mfde_pl,
 
 	val = mlxsw_reg_mfde_crspace_to_log_address_get(mfde_pl);
 	err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val);
+	if (err)
+		return err;
+	val = mlxsw_reg_mfde_crspace_to_oe_get(mfde_pl);
+	err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
 	if (err)
 		return err;
 	val = mlxsw_reg_mfde_crspace_to_log_id_get(mfde_pl);
@@ -1774,6 +1859,15 @@  static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
 	case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP:
 		val_str = "KVD insertion machine stopped";
 		break;
+	case MLXSW_REG_MFDE_EVENT_ID_TEST:
+		val_str = "Test";
+		break;
+	case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT:
+		val_str = "FW assert";
+		break;
+	case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE:
+		val_str = "Fatal cause";
+		break;
 	default:
 		val_str = NULL;
 	}
@@ -1782,6 +1876,38 @@  static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
 		if (err)
 			return err;
 	}
+
+	err = devlink_fmsg_arr_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_arr_pair_nest_start(fmsg, "severity");
+	if (err)
+		return err;
+
+	val = mlxsw_reg_mfde_severity_get(mfde_pl);
+	err = devlink_fmsg_u8_pair_put(fmsg, "id", val);
+	if (err)
+		return err;
+	switch (val) {
+	case MLXSW_REG_MFDE_SEVERITY_FATL:
+		val_str = "Fatal";
+		break;
+	case MLXSW_REG_MFDE_SEVERITY_NRML:
+		val_str = "Normal";
+		break;
+	case MLXSW_REG_MFDE_SEVERITY_INTR:
+		val_str = "Debug";
+		break;
+	default:
+		val_str = NULL;
+	}
+	if (val_str) {
+		err = devlink_fmsg_string_pair_put(fmsg, "desc", val_str);
+		if (err)
+			return err;
+	}
+
 	err = devlink_fmsg_arr_pair_nest_end(fmsg);
 	if (err)
 		return err;
@@ -1840,6 +1966,11 @@  static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
 	case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP:
 		return mlxsw_core_health_fw_fatal_dump_kvd_im_stop(mfde_pl,
 								   fmsg);
+	case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT:
+		return mlxsw_core_health_fw_fatal_dump_fw_assert(mfde_pl, fmsg);
+	case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE:
+		return mlxsw_core_health_fw_fatal_dump_fatal_cause(mfde_pl,
+								   fmsg);
 	}
 
 	return 0;