@@ -1380,13 +1380,41 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d
mutex_unlock(fd_lock);
}
+static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
+{
+ /* If reset is due to heartbeat, device CPU is no responsive in
+ * which case no point sending PCI disable message to it.
+ */
+ if ((flags & HL_DRV_RESET_HARD) &&
+ !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
+ /* Disable PCI access from device F/W so he won't send
+ * us additional interrupts. We disable MSI/MSI-X at
+ * the halt_engines function and we can't have the F/W
+ * sending us interrupts after that. We need to disable
+ * the access here because if the device is marked
+ * disable, the message won't be send. Also, in case
+ * of heartbeat, the device CPU is marked as disable
+ * so this message won't be sent
+ */
+ if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
+ dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
+ return;
+ }
+
+ /* verify that last EQs are handled before disabled is set */
+ if (hdev->cpu_queues_enable)
+ synchronize_irq(pci_irq_vector(hdev->pdev,
+ hdev->asic_prop.eq_interrupt_id));
+ }
+}
+
static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
{
u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
/* No consecutive mechanism when user context exists */
if (hdev->is_compute_ctx_active)
- goto disable_pci;
+ return;
/*
* 'reset cause' is being updated here, because getting here
@@ -1418,30 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
} else {
hdev->reset_info.reset_trigger_repeated = 1;
}
-
- /* If reset is due to heartbeat, device CPU is no responsive in
- * which case no point sending PCI disable message to it.
- *
- * If F/W is performing the reset, no need to send it a message to disable
- * PCI access
- */
-
-disable_pci:
- if ((flags & HL_DRV_RESET_HARD) &&
- !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
- /* Disable PCI access from device F/W so he won't send
- * us additional interrupts. We disable MSI/MSI-X at
- * the halt_engines function and we can't have the F/W
- * sending us interrupts after that. We need to disable
- * the access here because if the device is marked
- * disable, the message won't be send. Also, in case
- * of heartbeat, the device CPU is marked as disable
- * so this message won't be sent
- */
- if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
- dev_warn(hdev->dev,
- "Failed to disable FW's PCI access\n");
- }
}
/*
@@ -1562,6 +1566,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
escalate_reset_flow:
handle_reset_trigger(hdev, flags);
+ send_disable_pci_access(hdev, flags);
/* This also blocks future CS/VM/JOB completion operations */
hdev->disabled = true;
@@ -662,6 +662,7 @@ struct hl_hints_range {
* @user_interrupt_count: number of user interrupts.
* @user_dec_intr_count: number of decoder interrupts exposed to user.
* @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
+ * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset.
* @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error.
* @cache_line_size: device cache line size.
* @server_type: Server type that the ASIC is currently installed in.
@@ -793,6 +794,7 @@ struct asic_fixed_properties {
u16 user_interrupt_count;
u16 user_dec_intr_count;
u16 tpc_interrupt_id;
+ u16 eq_interrupt_id;
u16 unexpected_user_error_interrupt_id;
u16 cache_line_size;
u16 server_type;
@@ -682,6 +682,9 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
prop->first_available_user_interrupt = USHRT_MAX;
prop->tpc_interrupt_id = USHRT_MAX;
+ /* single msi */
+ prop->eq_interrupt_id = 0;
+
for (i = 0 ; i < HL_MAX_DCORES ; i++)
prop->first_available_cq[i] = USHRT_MAX;
@@ -2439,6 +2439,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST;
prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT;
+ prop->eq_interrupt_id = GAUDI2_IRQ_NUM_EVENT_QUEUE;
prop->unexpected_user_error_interrupt_id = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR;
prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER;
@@ -473,6 +473,7 @@ int goya_set_fixed_properties(struct hl_device *hdev)
prop->first_available_user_interrupt = USHRT_MAX;
prop->tpc_interrupt_id = USHRT_MAX;
+ prop->eq_interrupt_id = GOYA_EVENT_QUEUE_MSIX_IDX;
for (i = 0 ; i < HL_MAX_DCORES ; i++)
prop->first_available_cq[i] = USHRT_MAX;