@@ -3689,6 +3689,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg);
irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg);
irqreturn_t hl_irq_handler_user_interrupt(int irq, void *arg);
irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg);
+irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg);
u32 hl_cq_inc_ptr(u32 ptr);
int hl_asid_init(struct hl_device *hdev);
@@ -401,6 +401,18 @@ irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg)
return IRQ_HANDLED;
}
+irqreturn_t hl_irq_eq_error_interrupt_thread_handler(int irq, void *arg)
+{
+ u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
+ struct hl_device *hdev = arg;
+
+ dev_err(hdev->dev, "EQ error interrupt received\n");
+
+ hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
+
+ return IRQ_HANDLED;
+}
+
/**
* hl_irq_handler_eq - irq handler for event queue
*
@@ -4175,6 +4175,8 @@ static const char *gaudi2_irq_name(u16 irq_number)
return "gaudi2 unexpected error";
case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST:
return "gaudi2 user completion";
+ case GAUDI2_IRQ_NUM_EQ_ERROR:
+ return "gaudi2 eq error";
default:
return "invalid";
}
@@ -4317,6 +4319,15 @@ static int gaudi2_enable_msix(struct hl_device *hdev)
}
}
+ irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
+ rc = request_threaded_irq(irq, NULL, hl_irq_eq_error_interrupt_thread_handler,
+ IRQF_ONESHOT, gaudi2_irq_name(GAUDI2_IRQ_NUM_EQ_ERROR),
+ hdev);
+ if (rc) {
+ dev_err(hdev->dev, "Failed to request IRQ %d", irq);
+ goto free_user_irq;
+ }
+
gaudi2->hw_cap_initialized |= HW_CAP_MSIX;
return 0;
@@ -4376,6 +4387,7 @@ static void gaudi2_sync_irqs(struct hl_device *hdev)
}
synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EVENT_QUEUE));
+ synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR));
}
static void gaudi2_disable_msix(struct hl_device *hdev)
@@ -4412,6 +4424,9 @@ static void gaudi2_disable_msix(struct hl_device *hdev)
cq = &hdev->completion_queue[GAUDI2_RESERVED_CQ_CS_COMPLETION];
free_irq(irq, cq);
+ irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EQ_ERROR);
+ free_irq(irq, hdev);
+
pci_free_irq_vectors(hdev->pdev);
gaudi2->hw_cap_initialized &= ~HW_CAP_MSIX;
@@ -11345,6 +11360,7 @@ static int gaudi2_ack_mmu_page_fault_or_access_error(struct hl_device *hdev, u64
static void gaudi2_get_msi_info(__le32 *table)
{
table[CPUCP_EVENT_QUEUE_MSI_TYPE] = cpu_to_le32(GAUDI2_EVENT_QUEUE_MSIX_IDX);
+ table[CPUCP_EVENT_QUEUE_ERR_MSI_TYPE] = cpu_to_le32(GAUDI2_IRQ_NUM_EQ_ERROR);
}
static int gaudi2_map_pll_idx_to_fw_idx(u32 pll_idx)
@@ -419,6 +419,7 @@ enum gaudi2_irq_num {
GAUDI2_IRQ_NUM_NIC_PORT_FIRST,
GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1),
GAUDI2_IRQ_NUM_TPC_ASSERT,
+ GAUDI2_IRQ_NUM_EQ_ERROR,
GAUDI2_IRQ_NUM_RESERVED_FIRST,
GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1),
GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT,
@@ -1004,6 +1004,7 @@ enum cpucp_msi_type {
CPUCP_NIC_PORT5_MSI_TYPE,
CPUCP_NIC_PORT7_MSI_TYPE,
CPUCP_NIC_PORT9_MSI_TYPE,
+ CPUCP_EVENT_QUEUE_ERR_MSI_TYPE,
CPUCP_NUM_OF_MSI_TYPES
};