[27/37] drm/i915/dg1: Log counter on SLM ECC error
diff mbox series

Message ID 20200521003803.18936-28-lucas.demarchi@intel.com
State New
Headers show
Series
  • Introduce DG1
Related show

Commit Message

Lucas De Marchi May 21, 2020, 12:37 a.m. UTC
From: Fernando Pacheco <fernando.pacheco@intel.com>

Correctable and uncorrectable Shared Local Memory (SLM)
ECC errors will be counted in two different Thread Dispatch
Logic (TDL) registers. GuC will receive a message
from TDL when the first correctable/uncorrectable error is
detected by SLM (first after a reset or register clear). This
message is then forwarded to the appropriate severity register.

Correctable errors will route to the kernel driver and uncorrectable errors
are expected to route as PCIe errors, although the option exists to route
both as interrupts.

Service the interrupt and read the TDL registers for the error count.

Cc: Paulo Zanoni <paulo.r.zanoni@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Fernando Pacheco <fernando.pacheco@intel.com>
Cc: Radhakrishna Sripada <radhakrishna.sripada@intel.com>
Signed-off-by: Fernando Pacheco <fernando.pacheco@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c | 10 +++++++++-
 drivers/gpu/drm/i915/i915_reg.h |  7 +++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 17e679b910da..ca35edef492d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2536,7 +2536,7 @@  gen12_gt_hw_error_handler(struct drm_i915_private * const i915,
 {
 	void __iomem * const regs = i915->uncore.regs;
 	const char *hw_err_str = hardware_error_type_to_str(hw_err);
-	u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
+	u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR | SLM_ERROR);
 	u32 errstat;
 
 	lockdep_assert_held(&i915->irq_lock);
@@ -2565,6 +2565,14 @@  gen12_gt_hw_error_handler(struct drm_i915_private * const i915,
 	if (errstat & EU_IC_ERROR)
 		DRM_ERROR("detected EU IC %s hardware error\n", hw_err_str);
 
+	if (errstat & SLM_ERROR) {
+		struct drm_i915_private *dev_priv = i915;
+
+		DRM_ERROR("detected %u SLM %s hardware error(s)\n",
+			  I915_READ(SLM_ECC_ERROR_CNTR(hw_err)),
+			  hw_err_str);
+	}
+
 	/*
 	 * TODO: The remaining GT errors don't have a
 	 * need for targeted logging at the moment. We
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 40cb361b4254..b9c142f86611 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7765,6 +7765,13 @@  enum hardware_error {
 						_ERR_STAT_GT_NONFATAL))
 #define  EU_GRF_ERROR			(1 << 15)
 #define  EU_IC_ERROR			(1 << 14)
+#define  SLM_ERROR			(1 << 13)
+
+#define _SLM_ECC_ERROR_CNT		0xe7f4
+#define _SLM_UNCORR_ECC_ERROR_CNT	0xe7c0
+#define SLM_ECC_ERROR_CNTR(x)		_MMIO((x) == HARDWARE_ERROR_CORRECTABLE ? \
+						_SLM_ECC_ERROR_CNT : \
+						_SLM_UNCORR_ECC_ERROR_CNT)
 
 #define GEN11_RENDER_COPY_INTR_ENABLE	_MMIO(0x190030)
 #define GEN11_VCS_VECS_INTR_ENABLE	_MMIO(0x190034)