diff mbox series

[rdma-core,13/27] mlx5: VFIO poll_health support

Message ID 20210720081647.1980-14-yishaih@nvidia.com (mailing list archive)
State Not Applicable
Headers show
Series Introduce mlx5 user space driver over VFIO | expand

Commit Message

Yishai Hadas July 20, 2021, 8:16 a.m. UTC
From: Mark Zhang <markzhang@nvidia.com>

Add firmware health polling support in vfio driver.

A failed health check is not expected, and we refer to it as a fatal error
in the firmware that should be avoided/fixed.

The health buffer check is triggered by the application upon its call to
mlx5dv_vfio_process_events().

Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 providers/mlx5/mlx5_vfio.c | 168 +++++++++++++++++++++++++++++++++++++++++++++
 providers/mlx5/mlx5_vfio.h |  10 ++-
 2 files changed, 177 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/providers/mlx5/mlx5_vfio.c b/providers/mlx5/mlx5_vfio.c
index 85ee25b..c37358c 100644
--- a/providers/mlx5/mlx5_vfio.c
+++ b/providers/mlx5/mlx5_vfio.c
@@ -22,6 +22,8 @@ 
 #include <poll.h>
+#include <time.h>
 #include <util/mmio.h>
 
+#include <ccan/array_size.h>
+
 #include "mlx5dv.h"
 #include "mlx5_vfio.h"
 #include "mlx5.h"
@@ -1910,6 +1912,7 @@  enum mlx5_cmd_addr_l_sz_offset {
 
 enum {
 	MLX5_NIC_IFC_DISABLED = 1,
+	MLX5_NIC_IFC_SW_RESET = 7, /* NIC state the health check reports as MLX5_SENSOR_NIC_SW_RESET */
 };
 
 static uint8_t mlx5_vfio_get_nic_state(struct mlx5_vfio_context *ctx)
@@ -1978,6 +1981,169 @@  static int mlx5_vfio_teardown_hca(struct mlx5_vfio_context *ctx)
 	return mlx5_vfio_teardown_hca_regular(ctx);
 }
 
+/*
+ * Report whether PCI access to the device looks dead, judged by the
+ * firmware version word of the health buffer in the initialization segment.
+ */
+static bool sensor_pci_not_working(struct mlx5_init_seg *init_seg)
+{
+	uint32_t fw_ver = be32toh(mmio_read32_be(&init_seg->health.fw_ver));
+
+	/* A read from an offline PCI device comes back as all ones */
+	return fw_ver == 0xffffffff;
+}
+
+enum mlx5_fatal_assert_bit_offsets {
+	MLX5_RFR_OFFSET = 31, /* right shift applied to the 32-bit health rfr word, leaving only its top bit */
+};
+
+/*
+ * Report whether the health buffer shows both the RFR bit (bit
+ * MLX5_RFR_OFFSET of the rfr word) and a non-zero syndrome.
+ */
+static bool sensor_fw_synd_rfr(struct mlx5_init_seg *init_seg)
+{
+	uint32_t rfr_word = be32toh(mmio_read32_be(&init_seg->health.rfr));
+	uint8_t synd = mmio_read8(&init_seg->health.synd);
+	bool rfr_set = (rfr_word >> MLX5_RFR_OFFSET) != 0;
+
+	return rfr_set && synd != 0;
+}
+
+enum  {
+	MLX5_SENSOR_NO_ERR = 0, /* no fatal sensor tripped */
+	MLX5_SENSOR_PCI_COMM_ERR = 1, /* health fw_ver word read back as 0xffffffff */
+	MLX5_SENSOR_NIC_DISABLED = 3, /* NIC state equals MLX5_NIC_IFC_DISABLED */
+	MLX5_SENSOR_NIC_SW_RESET = 4, /* NIC state equals MLX5_NIC_IFC_SW_RESET */
+	MLX5_SENSOR_FW_SYND_RFR = 5, /* RFR bit set together with a non-zero syndrome */
+};
+
+/*
+ * Run the fatal-error sensors in priority order and return the first one
+ * that trips, as an MLX5_SENSOR_* code; MLX5_SENSOR_NO_ERR when none does.
+ *
+ * The PCI sensor is evaluated first, before any other register is looked
+ * at. The NIC state is then sampled exactly once so that both state
+ * comparisons are made against the same register value.
+ */
+static uint32_t mlx5_health_check_fatal_sensors(struct mlx5_vfio_context *ctx)
+{
+	uint8_t nic_state;
+
+	if (sensor_pci_not_working(ctx->bar_map))
+		return MLX5_SENSOR_PCI_COMM_ERR;
+
+	nic_state = mlx5_vfio_get_nic_state(ctx);
+	if (nic_state == MLX5_NIC_IFC_DISABLED)
+		return MLX5_SENSOR_NIC_DISABLED;
+
+	if (nic_state == MLX5_NIC_IFC_SW_RESET)
+		return MLX5_SENSOR_NIC_SW_RESET;
+
+	if (sensor_fw_synd_rfr(ctx->bar_map))
+		return MLX5_SENSOR_FW_SYND_RFR;
+
+	return MLX5_SENSOR_NO_ERR;
+}
+
+/* Health syndrome codes that hsynd_str() knows how to describe. */
+enum {
+	MLX5_HEALTH_SYNDR_FW_ERR = 0x1,
+	MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7,
+	MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR = 0x8,
+	MLX5_HEALTH_SYNDR_CRC_ERR = 0x9,
+	MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa,
+	MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb,
+	MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc,
+	MLX5_HEALTH_SYNDR_EQ_ERR = 0xd,
+	MLX5_HEALTH_SYNDR_EQ_INV = 0xe,
+	MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf,
+	MLX5_HEALTH_SYNDR_HIGH_TEMP = 0x10,
+};
+
+/*
+ * Map a health syndrome code to a short human-readable description.
+ *
+ * The returned pointer refers to a string literal and must not be freed.
+ * Codes outside the enum above yield "unrecognized error". None of the
+ * strings carries a trailing newline; the caller supplies line termination.
+ */
+static const char *hsynd_str(uint8_t synd)
+{
+	switch (synd) {
+	case MLX5_HEALTH_SYNDR_FW_ERR:
+		return "firmware internal error";
+	case MLX5_HEALTH_SYNDR_IRISC_ERR:
+		return "irisc not responding";
+	case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
+		return "unrecoverable hardware error";
+	case MLX5_HEALTH_SYNDR_CRC_ERR:
+		return "firmware CRC error";
+	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
+		return "ICM fetch PCI error";
+	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
+		return "HW fatal error";
+	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
+		return "async EQ buffer overrun";
+	case MLX5_HEALTH_SYNDR_EQ_ERR:
+		return "EQ error";
+	case MLX5_HEALTH_SYNDR_EQ_INV:
+		return "Invalid EQ referenced";
+	case MLX5_HEALTH_SYNDR_FFSER_ERR:
+		return "FFSER error";
+	case MLX5_HEALTH_SYNDR_HIGH_TEMP:
+		return "High temperature";
+	default:
+		return "unrecognized error";
+	}
+}
+
+/*
+ * Dump the firmware health buffer through mlx5_err() on ctx->dbg_fp.
+ *
+ * Nothing is printed when the syndrome byte is zero. The syndrome and the
+ * fw_rev word are each sampled once, so every line of the report that
+ * derives from them shows the same value.
+ */
+static void print_health_info(struct mlx5_vfio_context *ctx)
+{
+	struct mlx5_init_seg *iseg = ctx->bar_map;
+	struct health_buffer *h = &iseg->health;
+	/* "65535.65535.65535" is 17 characters plus the terminating NUL */
+	char fw_str[18] = {0};
+	uint32_t fw_rev;
+	uint8_t synd;
+	int i;
+
+	/* If the syndrome is 0, the device is OK and no need to print buffer */
+	synd = mmio_read8(&h->synd);
+	if (!synd)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
+		mlx5_err(ctx->dbg_fp, "assert_var[%d] 0x%08x\n",
+			 i, be32toh(mmio_read32_be(h->assert_var + i)));
+
+	mlx5_err(ctx->dbg_fp, "assert_exit_ptr 0x%08x\n",
+		 be32toh(mmio_read32_be(&h->assert_exit_ptr)));
+	mlx5_err(ctx->dbg_fp, "assert_callra 0x%08x\n",
+		 be32toh(mmio_read32_be(&h->assert_callra)));
+
+	fw_rev = be32toh(mmio_read32_be(&iseg->fw_rev));
+	/* Bounded write; every field is masked or shifted down to 16 bits */
+	snprintf(fw_str, sizeof(fw_str), "%u.%u.%u",
+		 fw_rev & 0xffff,
+		 fw_rev >> 16,
+		 be32toh(mmio_read32_be(&iseg->cmdif_rev_fw_sub)) & 0xffff);
+	mlx5_err(ctx->dbg_fp, "fw_ver %s\n", fw_str);
+	mlx5_err(ctx->dbg_fp, "hw_id 0x%08x\n", be32toh(mmio_read32_be(&h->hw_id)));
+	mlx5_err(ctx->dbg_fp, "irisc_index %d\n", mmio_read8(&h->irisc_index));
+	mlx5_err(ctx->dbg_fp, "synd 0x%x: %s\n", synd, hsynd_str(synd));
+	mlx5_err(ctx->dbg_fp, "ext_synd 0x%04x\n",
+		 be16toh(mmio_read16_be(&h->ext_synd)));
+	/*
+	 * NOTE(review): this prints the init segment fw_rev word, not the
+	 * health buffer's own fw_ver field - confirm which one is intended.
+	 */
+	mlx5_err(ctx->dbg_fp, "raw fw_ver 0x%08x\n", fw_rev);
+}
+
+/*
+ * Rate-limited firmware health check.
+ *
+ * At most once every POLL_HEALTH_INTERVAL milliseconds this checks the
+ * fatal sensors and samples the low 24 bits of the health counter. When a
+ * fatal sensor trips, or when MAX_MISSES polls in a row find the counter
+ * unchanged, the health buffer is dumped and the process is abort()ed.
+ *
+ * Elapsed time is taken from CLOCK_MONOTONIC so that stepping the wall
+ * clock cannot let a poll through early and count a false miss. If the
+ * clock cannot be read the check is silently skipped for this call.
+ */
+static void mlx5_vfio_poll_health(struct mlx5_vfio_context *ctx)
+{
+	struct mlx5_vfio_health_state *hstate = &ctx->health_state;
+	uint32_t fatal_error, count;
+	struct timespec ts;
+	uint64_t now_ms;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &ts))
+		return;
+
+	now_ms = (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
+	if (now_ms - hstate->prev_time < POLL_HEALTH_INTERVAL)
+		return;
+
+	fatal_error = mlx5_health_check_fatal_sensors(ctx);
+	if (fatal_error) {
+		mlx5_err(ctx->dbg_fp, "%s: Fatal error %u detected\n",
+			 __func__, fatal_error);
+		goto err;
+	}
+
+	/* Only the low 24 bits of the word are compared between polls */
+	count = be32toh(mmio_read32_be(&ctx->bar_map->health_counter)) & 0xffffff;
+	if (count == hstate->prev_count)
+		++hstate->miss_counter;
+	else
+		hstate->miss_counter = 0;
+
+	hstate->prev_time = now_ms;
+	hstate->prev_count = count;
+	if (hstate->miss_counter == MAX_MISSES) {
+		mlx5_err(ctx->dbg_fp,
+			 "device's health compromised - reached miss count\n");
+		goto err;
+	}
+
+	return;
+err:
+	print_health_info(ctx);
+	abort();
+}
+
 static int mlx5_vfio_setup_function(struct mlx5_vfio_context *ctx)
 {
 	int err;
@@ -2232,6 +2398,8 @@  int mlx5dv_vfio_process_events(struct ibv_context *ibctx)
 	uint64_t u;
 	ssize_t s;
 
+	mlx5_vfio_poll_health(ctx);
+
 	/* read to re-arm the FD and process all existing events */
 	s = read(ctx->cmd_comp_fd, &u, sizeof(uint64_t));
 	if (s < 0 && errno != EAGAIN) {
diff --git a/providers/mlx5/mlx5_vfio.h b/providers/mlx5/mlx5_vfio.h
index 8e240c8..296d6d1 100644
--- a/providers/mlx5/mlx5_vfio.h
+++ b/providers/mlx5/mlx5_vfio.h
@@ -240,6 +240,14 @@  struct mlx5_vfio_eqs_uar {
 	uint64_t iova;
 };
 
+#define POLL_HEALTH_INTERVAL 1000 /* ms; minimum spacing between two health polls */
+#define MAX_MISSES 3 /* polls in a row with an unchanged health counter that trigger the failure path */
+struct mlx5_vfio_health_state {
+	uint64_t prev_time; /* ms; timestamp of the last completed poll */
+	uint32_t prev_count; /* health counter value sampled by the last poll */
+	uint32_t miss_counter; /* consecutive polls that found the counter unchanged */
+};
+
 struct mlx5_vfio_context {
 	struct verbs_context vctx;
 	int container_fd;
@@ -258,7 +266,7 @@  struct mlx5_vfio_context {
 		uint32_t hca_cur[MLX5_CAP_NUM][DEVX_UN_SZ_DW(hca_cap_union)];
 		uint32_t hca_max[MLX5_CAP_NUM][DEVX_UN_SZ_DW(hca_cap_union)];
 	} caps;
-
+	struct mlx5_vfio_health_state health_state;
 	struct mlx5_eq async_eq;
 	struct mlx5_vfio_eqs_uar eqs_uar;
 	pthread_mutex_t eq_lock;