@@ -998,6 +998,8 @@ static void hl_device_heartbeat(struct work_struct *work)
{
struct hl_device *hdev = container_of(work, struct hl_device,
work_heartbeat.work);
+ struct hl_info_fw_err_info info = {0};
+ u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
if (!hl_device_operational(hdev, NULL))
goto reschedule;
@@ -1008,7 +1010,10 @@ static void hl_device_heartbeat(struct work_struct *work)
if (hl_device_operational(hdev, NULL))
dev_err(hdev->dev, "Device heartbeat failed!\n");
- hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);
+ info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
+ info.event_mask = &event_mask;
+ hl_handle_fw_err(hdev, &info);
+ hl_device_cond_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT, event_mask);
return;
@@ -2626,3 +2631,49 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_
if (event_mask)
*event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT;
}
+
+void hl_capture_hw_err(struct hl_device *hdev, u16 event_id)
+{
+ struct hw_err_info *info = &hdev->captured_err_info.hw_err;
+
+ /* Capture only the first HW err */
+ if (atomic_cmpxchg(&info->event_detected, 0, 1))
+ return;
+
+ info->event.timestamp = ktime_to_ns(ktime_get());
+ info->event.event_id = event_id;
+
+ info->event_info_available = true;
+}
+
+void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask)
+{
+ hl_capture_hw_err(hdev, event_id);
+
+ if (event_mask)
+ *event_mask |= HL_NOTIFIER_EVENT_CRITICL_HW_ERR;
+}
+
+void hl_capture_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *fw_info)
+{
+ struct fw_err_info *info = &hdev->captured_err_info.fw_err;
+
+ /* Capture only the first FW error */
+ if (atomic_cmpxchg(&info->event_detected, 0, 1))
+ return;
+
+ info->event.timestamp = ktime_to_ns(ktime_get());
+ info->event.err_type = fw_info->err_type;
+ if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR)
+ info->event.event_id = fw_info->event_id;
+
+ info->event_info_available = true;
+}
+
+void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info)
+{
+ hl_capture_fw_err(hdev, info);
+
+ if (info->event_mask)
+ *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
+}
@@ -3031,18 +3031,56 @@ struct razwi_info {
bool razwi_info_available;
};
+/**
+ * struct hw_err_info - HW error information.
+ * @event: holds information on the event.
+ * @event_detected: if set as 1, then a HW event was discovered for the
+ * first time after the driver has finished booting-up.
+ * currently we assume that only fatal events (that require hard-reset) are
+ * reported so we don't care of the others that might follow it.
+ * so once changed to 1, it will remain that way.
+ * TODO: support multiple events.
+ * @event_info_available: indicates that a HW event info is now available.
+ */
+struct hw_err_info {
+ struct hl_info_hw_err_event event;
+ atomic_t event_detected;
+ bool event_info_available;
+};
+
+/**
+ * struct fw_err_info - FW error information.
+ * @event: holds information on the event.
+ * @event_detected: if set as 1, then a FW event was discovered for the
+ * first time after the driver has finished booting-up.
+ * currently we assume that only fatal events (that require hard-reset) are
+ * reported so we don't care of the others that might follow it.
+ * so once changed to 1, it will remain that way.
+ * TODO: support multiple events.
+ * @event_info_available: indicates that a HW event info is now available.
+ */
+struct fw_err_info {
+ struct hl_info_fw_err_event event;
+ atomic_t event_detected;
+ bool event_info_available;
+};
+
/**
* struct hl_error_info - holds information collected during an error.
* @cs_timeout: CS timeout error information.
* @razwi_info: RAZWI information.
* @undef_opcode: undefined opcode information.
* @page_fault_info: page fault information.
+ * @hw_err: (fatal) hardware error information.
+ * @fw_err: firmware error information.
*/
struct hl_error_info {
struct cs_timeout_info cs_timeout;
struct razwi_info razwi_info;
struct undefined_opcode_info undef_opcode;
struct page_fault_info page_fault_info;
+ struct hw_err_info hw_err;
+ struct fw_err_info fw_err;
};
/**
@@ -3453,6 +3491,20 @@ struct hl_cs_encaps_sig_handle {
u32 count;
};
+/**
+ * struct hl_info_fw_err_info - firmware error information structure
+ * @err_type: The type of error detected (or reported).
+ * @event_mask: Pointer to the event mask to be modified with the detected error flag
+ * (can be NULL)
+ * @event_id: The id of the event that reported the error
+ * (applicable when err_type is HL_INFO_FW_REPORTED_ERR).
+ */
+struct hl_info_fw_err_info {
+ enum hl_info_fw_err_type err_type;
+ u64 *event_mask;
+ u16 event_id;
+};
+
/*
* IOCTLs
*/
@@ -3883,6 +3935,8 @@ void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_o
void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu);
void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
u64 *event_mask);
+void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask);
+void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info);
#ifdef CONFIG_DEBUG_FS
@@ -221,12 +221,9 @@ int hl_device_open(struct inode *inode, struct file *filp)
hl_debugfs_add_file(hpriv);
+ memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info));
atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
- atomic_set(&hdev->captured_err_info.razwi_info.razwi_detected, 0);
- atomic_set(&hdev->captured_err_info.page_fault_info.page_fault_detected, 0);
hdev->captured_err_info.undef_opcode.write_enable = true;
- hdev->captured_err_info.razwi_info.razwi_info_available = false;
- hdev->captured_err_info.page_fault_info.page_fault_info_available = false;
hdev->open_counter++;
hdev->last_successful_open_jif = jiffies;
@@ -830,6 +830,50 @@ static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
return copy_to_user(out, pgf_info->user_mappings, actual_size) ? -EFAULT : 0;
}
+static int hw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer;
+ struct hl_device *hdev = hpriv->hdev;
+ u32 user_buf_size = args->return_size;
+ struct hw_err_info *info;
+ int rc;
+
+ if ((!user_buf_size) || (!user_buf))
+ return -EINVAL;
+
+ if (user_buf_size < sizeof(struct hl_info_hw_err_event))
+ return -ENOMEM;
+
+ info = &hdev->captured_err_info.hw_err;
+ if (!info->event_info_available)
+ return -ENOENT;
+
+ rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_hw_err_event));
+ return rc ? -EFAULT : 0;
+}
+
+static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer;
+ struct hl_device *hdev = hpriv->hdev;
+ u32 user_buf_size = args->return_size;
+ struct fw_err_info *info;
+ int rc;
+
+ if ((!user_buf_size) || (!user_buf))
+ return -EINVAL;
+
+ if (user_buf_size < sizeof(struct hl_info_fw_err_event))
+ return -ENOMEM;
+
+ info = &hdev->captured_err_info.fw_err;
+ if (!info->event_info_available)
+ return -ENOENT;
+
+ rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_fw_err_event));
+ return rc ? -EFAULT : 0;
+}
+
static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args)
{
void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer;
@@ -950,6 +994,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_UNREGISTER_EVENTFD:
return eventfd_unregister(hpriv, args);
+ case HL_INFO_HW_ERR_EVENT:
+ return hw_err_info(hpriv, args);
+
+ case HL_INFO_FW_ERR_EVENT:
+ return fw_err_info(hpriv, args);
+
default:
break;
}
@@ -7634,6 +7634,7 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type,
static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
{
struct gaudi_device *gaudi = hdev->asic_specific;
+ struct hl_info_fw_err_info fw_err_info;
u64 data = le64_to_cpu(eq_entry->data[0]), event_mask = 0;
u32 ctl = le32_to_cpu(eq_entry->hdr.ctl);
u32 fw_fatal_err_flag = 0, flags = 0;
@@ -7912,7 +7913,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_FW_ALIVE_S:
gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
- event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+ fw_err_info.err_type = HL_INFO_FW_REPORTED_ERR;
+ fw_err_info.event_id = event_type;
+ fw_err_info.event_mask = &event_mask;
+ hl_handle_fw_err(hdev, &fw_err_info);
goto reset_device;
default:
@@ -7943,6 +7947,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
}
if (reset_required) {
+ /* escalate general hw errors to critical/fatal error */
+ if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR)
+ hl_handle_critical_hw_err(hdev, event_type, &event_mask);
+
hl_device_cond_reset(hdev, flags, event_mask);
} else {
hl_fw_unmask_irq(hdev, event_type);
@@ -9444,6 +9444,10 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
} else {
reset_flags |= HL_DRV_RESET_DELAY;
}
+ /* escalate general hw errors to critical/fatal error */
+ if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR)
+ hl_handle_critical_hw_err(hdev, event_type, &event_mask);
+
event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
hl_device_cond_reset(hdev, reset_flags, event_mask);
}
@@ -723,6 +723,10 @@ enum hl_server_type {
* HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
* HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
* HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened
+ * HL_NOTIFIER_EVENT_CRITICAL_HW_ERR - Indicates a HW error that requires SW abort and
+ * HW reset
+ * HL_NOTIFIER_EVENT_CRITICAL_FW_ERR - Indicates a FW error that requires SW abort and
+ * HW reset
*/
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
@@ -733,6 +737,8 @@ enum hl_server_type {
#define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)
#define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8)
+#define HL_NOTIFIER_EVENT_CRITICL_HW_ERR (1ULL << 9)
+#define HL_NOTIFIER_EVENT_CRITICL_FW_ERR (1ULL << 10)
/* Opcode for management ioctl
*
@@ -790,6 +796,8 @@ enum hl_server_type {
* HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault.
* HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event.
* HL_INFO_FW_GENERIC_REQ - Send generic request to FW.
+ * HL_INFO_HW_ERR_EVENT - Retrieve information on the reported HW error.
+ * HL_INFO_FW_ERR_EVENT - Retrieve information on the reported FW error.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@@ -824,6 +832,8 @@ enum hl_server_type {
#define HL_INFO_PAGE_FAULT_EVENT 33
#define HL_INFO_USER_MAPPINGS 34
#define HL_INFO_FW_GENERIC_REQ 35
+#define HL_INFO_HW_ERR_EVENT 36
+#define HL_INFO_FW_ERR_EVENT 37
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@@ -1161,6 +1171,39 @@ struct hl_info_undefined_opcode_event {
__u32 stream_id;
};
+/**
+ * struct hl_info_hw_err_event - info about HW error
+ * @timestamp: timestamp of error occurrence
+ * @event_id: The async event ID (specific to each device type).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_hw_err_event {
+ __s64 timestamp;
+ __u16 event_id;
+ __u16 pad[3];
+};
+
+/* FW error definition for event_type in struct hl_info_fw_err_event */
+enum hl_info_fw_err_type {
+ HL_INFO_FW_HEARTBEAT_ERR,
+ HL_INFO_FW_REPORTED_ERR,
+};
+
+/**
+ * struct hl_info_fw_err_event - info about FW error
+ * @timestamp: time-stamp of error occurrence
+ * @err_type: The type of event as defined in hl_info_fw_err_type.
+ * @event_id: The async event ID (specific to each device type, applicable only when event type is
+ * HL_INFO_FW_REPORTED_ERR).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_fw_err_event {
+ __s64 timestamp;
+ __u16 err_type;
+ __u16 event_id;
+ __u32 pad;
+};
+
/**
* struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
* @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size