@@ -1052,12 +1052,22 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 cpu_q_id;
if (!prop->cpucp_info.eq_health_check_supported)
return true;
if (!hdev->eq_heartbeat_received) {
+ cpu_q_id = hdev->heartbeat_debug_info.cpu_queue_id;
+
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
+
+ dev_err(hdev->dev, "Heartbeat events counter: %u, Q_PI: %u, Q_CI: %u, EQ CI: %u, EQ prev: %u\n",
+ hdev->heartbeat_debug_info.heartbeat_event_counter,
+ hdev->kernel_queues[cpu_q_id].pi,
+ atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
+ hdev->event_queue.ci,
+ hdev->event_queue.prev_eqe_index);
return false;
}
@@ -1138,6 +1148,8 @@ static int device_late_init(struct hl_device *hdev)
hdev->high_pll = hdev->asic_prop.high_pll;
if (hdev->heartbeat) {
+ hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
+
/*
* Before scheduling the heartbeat driver will check if eq event has received.
* for the first schedule we need to set the indication as true then for the next
@@ -71,7 +71,7 @@ struct hl_fpriv;
#define HL_DEVICE_TIMEOUT_USEC 1000000 /* 1 s */
-#define HL_HEARTBEAT_PER_USEC 5000000 /* 5 s */
+#define HL_HEARTBEAT_PER_USEC 10000000 /* 10 s */
#define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */
@@ -3174,6 +3174,16 @@ struct hl_reset_info {
u8 watchdog_active;
};
+/**
+ * struct eq_heartbeat_debug_info - stores debug info to be used upon heartbeat failure.
+ * @heartbeat_event_counter: number of heartbeat events received.
+ * @cpu_queue_id: used to read the queue pi/ci
+ */
+struct eq_heartbeat_debug_info {
+ u32 heartbeat_event_counter;
+ u32 cpu_queue_id;
+};
+
/**
* struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -3262,6 +3272,7 @@ struct hl_reset_info {
* @clk_throttling: holds information about current/previous clock throttling events
* @captured_err_info: holds information about errors.
* @reset_info: holds current device reset information.
+ * @heartbeat_debug_info: counters used to debug heartbeat failures.
* @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling.
* @stream_master_qid_arr: pointer to array with QIDs of master streams.
* @fw_inner_major_ver: the major of current loaded preboot inner version.
@@ -3452,6 +3463,8 @@ struct hl_device {
struct hl_reset_info reset_info;
+ struct eq_heartbeat_debug_info heartbeat_debug_info;
+
cpumask_t irq_affinity_mask;
u32 *stream_master_qid_arr;
@@ -3796,6 +3796,8 @@ static int gaudi2_sw_init(struct hl_device *hdev)
if (rc)
goto special_blocks_free;
+ hdev->heartbeat_debug_info.cpu_queue_id = GAUDI2_QUEUE_ID_CPU_PQ;
+
return 0;
special_blocks_free:
@@ -9777,6 +9779,7 @@ static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
static void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
{
+ hdev->heartbeat_debug_info.heartbeat_event_counter++;
hdev->eq_heartbeat_received = true;
}