diff mbox series

[5/5] drm/msm: Add debugfs to disable hw err handling

Message ID 20211109181117.591148-6-robdclark@gmail.com (mailing list archive)
State New, archived
Headers show
Series drm/msm: Cleanup and drm/sched tdr prep | expand

Commit Message

Rob Clark Nov. 9, 2021, 6:11 p.m. UTC
From: Rob Clark <robdclark@chromium.org>

Add a debugfs interface to ignore hw error irqs, in order to force a
fallback to the sw hangcheck mechanism.  Because hw error detection is
pretty good on newer gens, igt tests need this in order to exercise the
sw hang detection.

Signed-off-by: Rob Clark <robdclark@chromium.org>
---
 drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 ++++++
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ++++
 drivers/gpu/drm/msm/msm_debugfs.c     | 3 +++
 drivers/gpu/drm/msm/msm_drv.h         | 9 +++++++++
 4 files changed, 22 insertions(+)

Comments

Akhil P Oommen Nov. 11, 2021, 5:14 p.m. UTC | #1
On 11/9/2021 11:41 PM, Rob Clark wrote:
> From: Rob Clark <robdclark@chromium.org>
> 
> Add a debugfs interface to ignore hw error irqs, in order to force
> fallback to sw hangcheck mechanism.  Because the hw error detection is
> pretty good on newer gens, we need this for igt tests to test the sw
> hang detection.
> 
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> ---
>   drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 6 ++++++
>   drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 ++++
>   drivers/gpu/drm/msm/msm_debugfs.c     | 3 +++
>   drivers/gpu/drm/msm/msm_drv.h         | 9 +++++++++
>   4 files changed, 22 insertions(+)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index 6163990a4d09..ec8e043c9d38 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -1252,6 +1252,7 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
>   
>   static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
>   {
> +	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	u32 status = gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS);
>   
>   	/*
> @@ -1261,6 +1262,11 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
>   	gpu_write(gpu, REG_A5XX_RBBM_INT_CLEAR_CMD,
>   		status & ~A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR);
>   
> +	if (priv->disable_err_irq) {
> +		status &= A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS |
> +			  A5XX_RBBM_INT_0_MASK_CP_SW;
> +	}
> +
>   	/* Pass status to a5xx_rbbm_err_irq because we've already cleared it */
>   	if (status & RBBM_ERROR_MASK)
>   		a5xx_rbbm_err_irq(gpu, status);
> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> index 3d2da81cb2c9..8a2af3a27e33 100644
> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
> @@ -1373,10 +1373,14 @@ static void a6xx_fault_detect_irq(struct msm_gpu *gpu)
>   
>   static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
>   {
> +	struct msm_drm_private *priv = gpu->dev->dev_private;
>   	u32 status = gpu_read(gpu, REG_A6XX_RBBM_INT_0_STATUS);
>   
>   	gpu_write(gpu, REG_A6XX_RBBM_INT_CLEAR_CMD, status);
>   
> +	if (priv->disable_err_irq)
> +		status &= A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS;
> +
>   	if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT)
>   		a6xx_fault_detect_irq(gpu);
>   
> diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
> index 6a99e8b5d25d..956b1efc3721 100644
> --- a/drivers/gpu/drm/msm/msm_debugfs.c
> +++ b/drivers/gpu/drm/msm/msm_debugfs.c
> @@ -242,6 +242,9 @@ void msm_debugfs_init(struct drm_minor *minor)
>   	debugfs_create_u32("hangcheck_period_ms", 0600, minor->debugfs_root,
>   		&priv->hangcheck_period);
>   
> +	debugfs_create_bool("disable_err_irq", 0600, minor->debugfs_root,
> +		&priv->disable_err_irq);
> +
>   	debugfs_create_file("shrink", S_IRWXU, minor->debugfs_root,
>   		dev, &shrink_fops);
>   
> diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
> index 2943c21d9aac..a8da7a7efb84 100644
> --- a/drivers/gpu/drm/msm/msm_drv.h
> +++ b/drivers/gpu/drm/msm/msm_drv.h
> @@ -246,6 +246,15 @@ struct msm_drm_private {
>   
>   	/* For hang detection, in ms */
>   	unsigned int hangcheck_period;
> +
> +	/**
> +	 * disable_err_irq:
> +	 *
> +	 * Disable handling of GPU hw error interrupts, to force fallback to
> +	 * sw hangcheck timer.  Written (via debugfs) by igt tests to test
> +	 * the sw hangcheck mechanism.
> +	 */
> +	bool disable_err_irq;
>   };
>   
>   struct msm_format {
> 

Reviewed-by: Akhil P Oommen <akhilpo@codeaurora.org>

-Akhil.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 6163990a4d09..ec8e043c9d38 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -1252,6 +1252,7 @@  static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
 
 static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
 {
+	struct msm_drm_private *priv = gpu->dev->dev_private;
 	u32 status = gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS);
 
 	/*
@@ -1261,6 +1262,11 @@  static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
 	gpu_write(gpu, REG_A5XX_RBBM_INT_CLEAR_CMD,
 		status & ~A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR);
 
+	if (priv->disable_err_irq) {
+		status &= A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS |
+			  A5XX_RBBM_INT_0_MASK_CP_SW;
+	}
+
 	/* Pass status to a5xx_rbbm_err_irq because we've already cleared it */
 	if (status & RBBM_ERROR_MASK)
 		a5xx_rbbm_err_irq(gpu, status);
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 3d2da81cb2c9..8a2af3a27e33 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1373,10 +1373,14 @@  static void a6xx_fault_detect_irq(struct msm_gpu *gpu)
 
 static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
 {
+	struct msm_drm_private *priv = gpu->dev->dev_private;
 	u32 status = gpu_read(gpu, REG_A6XX_RBBM_INT_0_STATUS);
 
 	gpu_write(gpu, REG_A6XX_RBBM_INT_CLEAR_CMD, status);
 
+	if (priv->disable_err_irq)
+		status &= A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS;
+
 	if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT)
 		a6xx_fault_detect_irq(gpu);
 
diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
index 6a99e8b5d25d..956b1efc3721 100644
--- a/drivers/gpu/drm/msm/msm_debugfs.c
+++ b/drivers/gpu/drm/msm/msm_debugfs.c
@@ -242,6 +242,9 @@  void msm_debugfs_init(struct drm_minor *minor)
 	debugfs_create_u32("hangcheck_period_ms", 0600, minor->debugfs_root,
 		&priv->hangcheck_period);
 
+	debugfs_create_bool("disable_err_irq", 0600, minor->debugfs_root,
+		&priv->disable_err_irq);
+
 	debugfs_create_file("shrink", S_IRWXU, minor->debugfs_root,
 		dev, &shrink_fops);
 
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 2943c21d9aac..a8da7a7efb84 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -246,6 +246,15 @@  struct msm_drm_private {
 
 	/* For hang detection, in ms */
 	unsigned int hangcheck_period;
+
+	/**
+	 * disable_err_irq:
+	 *
+	 * Disable handling of GPU hw error interrupts, to force fallback to
+	 * sw hangcheck timer.  Written (via debugfs) by igt tests to test
+	 * the sw hangcheck mechanism.
+	 */
+	bool disable_err_irq;
 };
 
 struct msm_format {