@@ -1062,11 +1062,28 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
return (device_id == hdev->pdev->device);
}
+static void stringify_time_of_last_heartbeat(struct hl_device *hdev, char *time_str, size_t size,
+ bool is_pq_hb)
+{
+ time64_t seconds = is_pq_hb ? hdev->heartbeat_debug_info.last_pq_heartbeat_ts
+ : hdev->heartbeat_debug_info.last_eq_heartbeat_ts;
+ struct tm tm;
+
+ if (!seconds)
+ return;
+
+ time64_to_tm(seconds, 0, &tm);
+
+ snprintf(time_str, size, "%ld-%02d-%02d %02d:%02d:%02d (UTC)",
+ tm.tm_year + 1900, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec);
+}
+
static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
{
struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info;
u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << 1) - 1;
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ char pq_time_str[64] = "N/A", eq_time_str[64] = "N/A";
if (!prop->cpucp_info.eq_health_check_supported)
return true;
@@ -1074,13 +1091,17 @@ static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
if (!hdev->eq_heartbeat_received) {
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
+ stringify_time_of_last_heartbeat(hdev, pq_time_str, sizeof(pq_time_str), true);
+ stringify_time_of_last_heartbeat(hdev, eq_time_str, sizeof(eq_time_str), false);
dev_err(hdev->dev,
- "Heartbeat events counter: %u, EQ CI: %u, PQ PI: %u, PQ CI: %u (%u)\n",
- heartbeat_debug_info->heartbeat_event_counter,
+ "EQ: {CI %u, HB counter %u, last HB time: %s}, PQ: {PI: %u, CI: %u (%u), last HB time: %s}\n",
hdev->event_queue.ci,
+ heartbeat_debug_info->heartbeat_event_counter,
+ eq_time_str,
hdev->kernel_queues[cpu_q_id].pi,
atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
- atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask);
+ atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask,
+ pq_time_str);
hl_eq_dump(hdev, &hdev->event_queue);
@@ -1562,12 +1583,19 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
}
}
+static void reset_heartbeat_debug_info(struct hl_device *hdev)
+{
+ hdev->heartbeat_debug_info.last_pq_heartbeat_ts = 0;
+ hdev->heartbeat_debug_info.last_eq_heartbeat_ts = 0;
+ hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
+}
+
static inline void device_heartbeat_schedule(struct hl_device *hdev)
{
if (!hdev->heartbeat)
return;
- hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
+ reset_heartbeat_debug_info(hdev);
/*
* Before scheduling the heartbeat driver will check if eq event has received.
@@ -2883,6 +2911,7 @@ void hl_set_irq_affinity(struct hl_device *hdev, int irq)
void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
{
hdev->heartbeat_debug_info.heartbeat_event_counter++;
+ hdev->heartbeat_debug_info.last_eq_heartbeat_ts = ktime_get_real_seconds();
hdev->eq_heartbeat_received = true;
}
@@ -466,12 +466,12 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
} else {
struct hl_bd *bd = queue->kernel_address;
- bd += hl_pi_2_offset(queue->pi);
+ bd += hl_pi_2_offset(pi);
dev_err(hdev->dev, "Device CPU packet timeout (status = 0x%x)\n"
- "Pkt info: dma_addr: 0x%llx, kernel_addr: %p, len:0x%x, ctl: 0x%x, ptr:0x%llx, dram_bd:%u\n",
- tmp, pkt_dma_addr, (void *)pkt, bd->len, bd->ctl, bd->ptr,
- queue->dram_bd);
+ "Pkt info[%u]: dma_addr: 0x%llx, kernel_addr: %p, len:0x%x, ctl: 0x%x, ptr:0x%llx, dram_bd:%u\n",
+ tmp, pi, pkt_dma_addr, (void *)pkt, bd->len, bd->ctl, bd->ptr,
+ queue->dram_bd);
}
hdev->device_cpu_disabled = true;
goto out;
@@ -681,12 +681,10 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
int rc;
memset(&hb_pkt, 0, sizeof(hb_pkt));
- hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
- CPUCP_PKT_CTL_OPCODE_SHIFT);
+ hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << CPUCP_PKT_CTL_OPCODE_SHIFT);
hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
- rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
- sizeof(hb_pkt), 0, &result);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, sizeof(hb_pkt), 0, &result);
if ((rc) || (result != CPUCP_PACKET_FENCE_VAL))
return -EIO;
@@ -697,6 +695,8 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
rc = -EIO;
}
+ hdev->heartbeat_debug_info.last_pq_heartbeat_ts = ktime_get_real_seconds();
+
return rc;
}
@@ -3196,10 +3196,15 @@ struct hl_reset_info {
/**
* struct eq_heartbeat_debug_info - stores debug info to be used upon heartbeat failure.
+ * @last_pq_heartbeat_ts: timestamp of the last test packet that was sent to FW.
+ * This packet is the trigger in FW to send the EQ heartbeat event.
+ * @last_eq_heartbeat_ts: timestamp of the last EQ heartbeat event that was received from FW.
* @heartbeat_event_counter: number of heartbeat events received.
* @cpu_queue_id: used to read the queue pi/ci
*/
struct eq_heartbeat_debug_info {
+ time64_t last_pq_heartbeat_ts;
+ time64_t last_eq_heartbeat_ts;
u32 heartbeat_event_counter;
u32 cpu_queue_id;
};