@@ -2515,6 +2515,124 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
return IRQ_HANDLED;
}
+/*
+ * Translate a hardware error class into the upper-case tag used in the
+ * log messages of the error handlers. Values outside the three known
+ * classes yield "UNKNOWN".
+ */
+static const char *
+hardware_error_type_to_str(const enum hardware_error hw_err)
+{
+	/* One entry per class; keep in sync with enum hardware_error. */
+	static const char * const class_names[HARDWARE_ERROR_MAX] = {
+		[HARDWARE_ERROR_CORRECTABLE] = "CORRECTABLE",
+		[HARDWARE_ERROR_NONFATAL] = "NONFATAL",
+		[HARDWARE_ERROR_FATAL] = "FATAL",
+	};
+
+	/* The unsigned compare also rejects negative values. */
+	if ((unsigned int)hw_err >= HARDWARE_ERROR_MAX)
+		return "UNKNOWN";
+
+	return class_names[hw_err];
+}
+
+/*
+ * Log and acknowledge the GT error status register for one error class.
+ *
+ * @i915: device being serviced
+ * @hw_err: error class; selects which ERR_STAT_GT register is used
+ *
+ * The caller must hold i915->irq_lock. When the status register reads
+ * non-zero, the value read is written back to it before returning to
+ * acknowledge the reported bits (the GT Error Register is RW1C). A zero
+ * read is only logged; nothing is written back.
+ *
+ * All messages are rate limited: this runs from the interrupt path with
+ * irq_lock held, and a recurring error must not be able to flood the log.
+ */
+static void
+gen12_gt_hw_error_handler(struct drm_i915_private * const i915,
+			  const enum hardware_error hw_err)
+{
+	void __iomem * const regs = i915->uncore.regs;
+	const char *hw_err_str = hardware_error_type_to_str(hw_err);
+	const u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
+	u32 errstat;
+
+	lockdep_assert_held(&i915->irq_lock);
+
+	errstat = raw_reg_read(regs, ERR_STAT_GT_REG(hw_err));
+
+	if (unlikely(!errstat)) {
+		DRM_ERROR_RATELIMITED("ERR_STAT_GT_REG_%s blank!\n",
+				      hw_err_str);
+		return;
+	}
+
+	/*
+	 * TODO: The GT Non Fatal Error Status Register
+	 * only has reserved bitfields defined.
+	 * Remove once there is something to service.
+	 */
+	if (hw_err == HARDWARE_ERROR_NONFATAL) {
+		DRM_ERROR_RATELIMITED("detected Non-Fatal hardware error\n");
+		goto out_clear;
+	}
+
+	if (errstat & EU_GRF_ERROR)
+		DRM_ERROR_RATELIMITED("detected EU GRF %s hardware error\n",
+				      hw_err_str);
+
+	if (errstat & EU_IC_ERROR)
+		DRM_ERROR_RATELIMITED("detected EU IC %s hardware error\n",
+				      hw_err_str);
+
+	/*
+	 * TODO: The remaining GT errors don't have a
+	 * need for targeted logging at the moment. We
+	 * still want to log detection of these errors, but
+	 * let's aggregate them until someone has a need for them.
+	 */
+	if (errstat & other_errors)
+		DRM_ERROR_RATELIMITED("detected hardware error(s) in ERR_STAT_GT_REG_%s: 0x%08x\n",
+				      hw_err_str, errstat & other_errors);
+
+out_clear:
+	/* Single acknowledge point for every path that saw a non-zero status. */
+	raw_reg_write(regs, ERR_STAT_GT_REG(hw_err), errstat);
+}
+
+/*
+ * Service one class of hardware error: find out which IP block(s)
+ * reported it, handle the GT case, and acknowledge the source register.
+ *
+ * @i915: device being serviced
+ * @hw_err: error class; selects which DEV_ERR_STAT register is used
+ *
+ * Takes i915->irq_lock with a plain spin_lock() for the whole sequence
+ * (NOTE(review): this presumes the caller already runs with interrupts
+ * disabled - confirm against the call site). A zero source register is
+ * logged and left untouched; otherwise the value read is written back
+ * after the GT handler has run (the Device Error Source Register is RW1C).
+ *
+ * All messages are rate limited so that a recurring or spurious error
+ * interrupt cannot flood the log from the interrupt path.
+ */
+static void
+gen12_hw_error_source_handler(struct drm_i915_private * const i915,
+			      const enum hardware_error hw_err)
+{
+	void __iomem * const regs = i915->uncore.regs;
+	const char *hw_err_str = hardware_error_type_to_str(hw_err);
+	u32 errsrc;
+
+	spin_lock(&i915->irq_lock);
+
+	errsrc = raw_reg_read(regs, DEV_ERR_STAT_REG(hw_err));
+	if (unlikely(!errsrc)) {
+		DRM_ERROR_RATELIMITED("DEV_ERR_STAT_REG_%s blank!\n",
+				      hw_err_str);
+		goto out_unlock;
+	}
+
+	/* The GT status is logged and cleared before the source bit is. */
+	if (errsrc & DEV_ERR_STAT_GT_ERROR)
+		gen12_gt_hw_error_handler(i915, hw_err);
+
+	/* Other sources have no dedicated handler; report the raw bits. */
+	if (errsrc & ~DEV_ERR_STAT_GT_ERROR)
+		DRM_ERROR_RATELIMITED("non-GT hardware error(s) in DEV_ERR_STAT_REG_%s: 0x%08x\n",
+				      hw_err_str,
+				      errsrc & ~DEV_ERR_STAT_GT_ERROR);
+
+	raw_reg_write(regs, DEV_ERR_STAT_REG(hw_err), errsrc);
+
+out_unlock:
+	spin_unlock(&i915->irq_lock);
+}
+
+/*
+ * GEN12+ reports hardware errors through three extra bits in the Master
+ * Interrupt Register, added to support dgfx card error handling. Each
+ * bit stands for one class of error: CORRECTABLE, NONFATAL or FATAL.
+ *
+ * Servicing one class takes two steps:
+ * 1. Read the Device Error Source Register (RW1C) of that class to
+ *    learn which IP block raised the error.
+ * 2. If the GT is the source, read and log the GT Error Register
+ *    (RW1C) of the same class.
+ *
+ * Every class whose bit is set in @master_ctl is serviced, in
+ * ascending enum order.
+ */
+static void
+gen12_hw_error_irq_handler(struct drm_i915_private * const i915,
+			   const u32 master_ctl)
+{
+	enum hardware_error err_class;
+
+	for (err_class = HARDWARE_ERROR_CORRECTABLE;
+	     err_class < HARDWARE_ERROR_MAX;
+	     err_class++) {
+		/* Skip classes that did not raise this interrupt. */
+		if (!(master_ctl & GEN12_ERROR_IRQ(err_class)))
+			continue;
+
+		gen12_hw_error_source_handler(i915, err_class);
+	}
+}
+
static u32
gen11_gu_misc_irq_ack(struct intel_gt *gt, const u32 master_ctl)
{
@@ -2597,6 +2715,9 @@ __gen11_irq_handler(struct drm_i915_private * const i915,
/* Find, queue (onto bottom-halves), then clear each source */
gen11_gt_irq_handler(gt, master_ctl);
+ if (IS_DG1(i915))
+ gen12_hw_error_irq_handler(i915, master_ctl);
+
/* IRQs are synced during runtime_suspend, we don't require a wakeref */
if (master_ctl & GEN11_DISPLAY_IRQ)
gen11_display_irq_handler(i915);
@@ -7647,6 +7647,10 @@ enum {
#define GEN11_MASTER_IRQ (1 << 31)
#define GEN11_PCU_IRQ (1 << 30)
#define GEN11_GU_MISC_IRQ (1 << 29)
+#define GEN12_FATAL_ERROR_IRQ (1 << 28) /* same bit as GEN12_ERROR_IRQ(HARDWARE_ERROR_FATAL) */
+#define GEN12_NON_FATAL_ERROR_IRQ (1 << 27) /* same bit as GEN12_ERROR_IRQ(HARDWARE_ERROR_NONFATAL) */
+#define GEN12_CORRECTABLE_ERROR_IRQ (1 << 26) /* same bit as GEN12_ERROR_IRQ(HARDWARE_ERROR_CORRECTABLE) */
+#define GEN12_ERROR_IRQ(x) (1 << (26 + (x))) /* x: enum hardware_error value, 0..2 */
#define GEN11_DISPLAY_IRQ (1 << 16)
#define GEN11_GT_DW_IRQ(x) (1 << (x))
#define GEN11_GT_DW1_IRQ (1 << 1)
@@ -7738,6 +7742,30 @@ enum {
#define GEN11_IIR_REG_SELECTOR(x) _MMIO(0x190070 + ((x) * 4))
+enum hardware_error { /* error class; the value is the index passed to GEN12_ERROR_IRQ(), DEV_ERR_STAT_REG() and ERR_STAT_GT_REG() */
+ HARDWARE_ERROR_CORRECTABLE = 0, /* master IRQ bit 26 */
+ HARDWARE_ERROR_NONFATAL = 1, /* master IRQ bit 27 */
+ HARDWARE_ERROR_FATAL = 2, /* master IRQ bit 28 */
+ HARDWARE_ERROR_MAX, /* count of classes, used as a loop bound; not a class itself */
+};
+
+#define _DEV_ERR_STAT_FATAL 0x100174 /* not named in DEV_ERR_STAT_REG(); offsets descend by 4 as the class index rises */
+#define _DEV_ERR_STAT_NONFATAL 0x100178
+#define _DEV_ERR_STAT_CORRECTABLE 0x10017c
+#define DEV_ERR_STAT_REG(x) _MMIO(_PICK_EVEN((x), \
+ _DEV_ERR_STAT_CORRECTABLE, \
+ _DEV_ERR_STAT_NONFATAL)) /* x: enum hardware_error; assumes _PICK_EVEN(i, a, b) is a + i * (b - a), so x == 2 lands on _DEV_ERR_STAT_FATAL - TODO confirm */
+#define DEV_ERR_STAT_GT_ERROR (1 << 0) /* set: the GT error status register is serviced; other set bits are only logged as non-GT */
+
+#define _ERR_STAT_GT_COR 0x100160 /* offsets ascend by 4 as the class index rises */
+#define _ERR_STAT_GT_NONFATAL 0x100164
+#define _ERR_STAT_GT_FATAL 0x100168 /* not named in ERR_STAT_GT_REG() */
+#define ERR_STAT_GT_REG(x) _MMIO(_PICK_EVEN((x), \
+ _ERR_STAT_GT_COR, \
+ _ERR_STAT_GT_NONFATAL)) /* x: enum hardware_error; assumes _PICK_EVEN(i, a, b) is a + i * (b - a), so x == 2 lands on _ERR_STAT_GT_FATAL - TODO confirm */
+#define EU_GRF_ERROR (1 << 15) /* logged individually as "EU GRF" by the GT error handler */
+#define EU_IC_ERROR (1 << 14) /* logged individually as "EU IC" by the GT error handler */
+
#define GEN11_RENDER_COPY_INTR_ENABLE _MMIO(0x190030)
#define GEN11_VCS_VECS_INTR_ENABLE _MMIO(0x190034)
#define GEN11_GUC_SG_INTR_ENABLE _MMIO(0x190038)