diff mbox

drm/i915: Stop gathering error states for CS error interrupts

Message ID 1415112742-24955-1-git-send-email-daniel.vetter@ffwll.ch (mailing list archive)
State New, archived
Headers show

Commit Message

Daniel Vetter Nov. 4, 2014, 2:52 p.m. UTC
There are quite a few bug reports with error states where the error
reasons make just about no sense at all. Like dying on tlbs for a
display plane that's not even there. Also users don't really report a
lot of bad side effects generally, just the error states.

Furthermore we don't even enable these interrupts any more on gen5+
(though the handling code is still there). So this mostly concerns old
platforms.

Given all that let's make our lives a bit easier and stop capturing
error states, in the hopes that we can just ignore them. In case
that's not true and the gpu indeed dies the hangcheck should
eventually kick in. And I've left some debug log in to make this case
noticeable. Referenced bug is just an example.

References: https://bugs.freedesktop.org/show_bug.cgi?id=82095
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c | 25 +++++++------------------
 1 file changed, 7 insertions(+), 18 deletions(-)

Comments

Jani Nikula Nov. 4, 2014, 3:02 p.m. UTC | #1
On Tue, 04 Nov 2014, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> There's quite a few bug reports with error states where the error
> reasons makes just about no sense at all. Like dying on tlbs for a
> display plane that's not even there. Also users don't really report a
> lot of bad side effects generally, just the error states.
>
> Furthermore we don't even enable these interrupts any more on gen5+
> (though the handling code is still there). So this mostly concerns old
> platforms.
>
> Given all that lets make our lives a bit easier and stop capturing
> error states, in the hopes that we can just ignore them. In case
> that's not true and the gpu indeed dies the hangcheck should
> eventually kick in. And I've left some debug log in to make this case
> noticeble. Referenced bug is just an example.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=82095
> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_irq.c | 25 +++++++------------------
>  1 file changed, 7 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 318a6a0724d0..2f78764cb215 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1319,10 +1319,8 @@ static void snb_gt_irq_handler(struct drm_device *dev,
>  
>  	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
>  		      GT_BSD_CS_ERROR_INTERRUPT |
> -		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
> -		i915_handle_error(dev, false, "GT error interrupt 0x%08x",
> -				  gt_iir);
> -	}
> +		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT))
> +		DRM_DEBUG("Command parser error, gt_iir 0x%08x", gt_iir);

\n missing all around.

BR,
Jani.

>  
>  	if (gt_iir & GT_PARITY_ERROR(dev))
>  		ivybridge_parity_error_irq_handler(dev, gt_iir);
> @@ -1715,11 +1713,8 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
>  		if (pm_iir & PM_VEBOX_USER_INTERRUPT)
>  			notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
>  
> -		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
> -			i915_handle_error(dev_priv->dev, false,
> -					  "VEBOX CS error interrupt 0x%08x",
> -					  pm_iir);
> -		}
> +		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT)
> +			DRM_DEBUG("Command parser error, pm_iir 0x%08x", pm_iir);
>  	}
>  }
>  
> @@ -3744,9 +3739,7 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
>  		 */
>  		spin_lock(&dev_priv->irq_lock);
>  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -			i915_handle_error(dev, false,
> -					  "Command parser error, iir 0x%08x",
> -					  iir);
> +			DRM_DEBUG("Command parser error, iir 0x%08x", iir);
>  
>  		for_each_pipe(dev_priv, pipe) {
>  			int reg = PIPESTAT(pipe);
> @@ -3929,9 +3922,7 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
>  		 */
>  		spin_lock(&dev_priv->irq_lock);
>  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -			i915_handle_error(dev, false,
> -					  "Command parser error, iir 0x%08x",
> -					  iir);
> +			DRM_DEBUG("Command parser error, iir 0x%08x", iir);
>  
>  		for_each_pipe(dev_priv, pipe) {
>  			int reg = PIPESTAT(pipe);
> @@ -4156,9 +4147,7 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
>  		 */
>  		spin_lock(&dev_priv->irq_lock);
>  		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> -			i915_handle_error(dev, false,
> -					  "Command parser error, iir 0x%08x",
> -					  iir);
> +			DRM_DEBUG("Command parser error, iir 0x%08x", iir);
>  
>  		for_each_pipe(dev_priv, pipe) {
>  			int reg = PIPESTAT(pipe);
> -- 
> 2.1.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Nov. 5, 2014, 8:35 a.m. UTC | #2
On Tue, Nov 04, 2014 at 03:52:22PM +0100, Daniel Vetter wrote:
> There's quite a few bug reports with error states where the error
> reasons makes just about no sense at all. Like dying on tlbs for a
> display plane that's not even there. Also users don't really report a
> lot of bad side effects generally, just the error states.
> 
> Furthermore we don't even enable these interrupts any more on gen5+
> (though the handling code is still there). So this mostly concerns old
> platforms.
> 
> Given all that lets make our lives a bit easier and stop capturing
> error states, in the hopes that we can just ignore them. In case
> that's not true and the gpu indeed dies the hangcheck should
> eventually kick in. And I've left some debug log in to make this case
> noticeble. Referenced bug is just an example.

The problem is they can be useful. They have shown when our modesetting
sequence has been completely snafu, and they can also be used to detect
page faults (but that does require a bit of kernel trickery) in
userspace GPU command streams. Even in the Display B on 845g, we must
have done something to upset the hardware, but we simply haven't
captured what. I am not yet convinced we want to throw all such reports
away, in case we do ignore genuine fail.

How about just toning down the error message for non-fatal faults, and
discarding the earlier error state should we get a fatal fault afterwards?
-Chris
Daniel Vetter Nov. 5, 2014, 9:56 a.m. UTC | #3
On Wed, Nov 05, 2014 at 08:35:01AM +0000, Chris Wilson wrote:
> On Tue, Nov 04, 2014 at 03:52:22PM +0100, Daniel Vetter wrote:
> > There's quite a few bug reports with error states where the error
> > reasons makes just about no sense at all. Like dying on tlbs for a
> > display plane that's not even there. Also users don't really report a
> > lot of bad side effects generally, just the error states.
> > 
> > Furthermore we don't even enable these interrupts any more on gen5+
> > (though the handling code is still there). So this mostly concerns old
> > platforms.
> > 
> > Given all that lets make our lives a bit easier and stop capturing
> > error states, in the hopes that we can just ignore them. In case
> > that's not true and the gpu indeed dies the hangcheck should
> > eventually kick in. And I've left some debug log in to make this case
> > noticeble. Referenced bug is just an example.
> 
> The problem is they can be useful. They have shown when our modesetting
> sequence has been completely snafu, and they can also be used to detect
> page faults (but that does require a bit of kernel trickery) in
> userspace GPU command streams. Even in the Display B on 845g, we must
> have done something to upset the hardware, but we simply haven't
> captured what. I am not yet convinced we want to throw all such reports
> away, in case we do ignore genuine fail.
> 
> How about just toning down the error message for non-fatal faults, and
> discarding the earlier error state should we get a fatal fault afterwards?

Hm yeah, that might work too.
-Daniel
Daniel Vetter Nov. 24, 2014, 8:57 p.m. UTC | #4
On Wed, Nov 05, 2014 at 10:56:06AM +0100, Daniel Vetter wrote:
> On Wed, Nov 05, 2014 at 08:35:01AM +0000, Chris Wilson wrote:
> > On Tue, Nov 04, 2014 at 03:52:22PM +0100, Daniel Vetter wrote:
> > > There's quite a few bug reports with error states where the error
> > > reasons makes just about no sense at all. Like dying on tlbs for a
> > > display plane that's not even there. Also users don't really report a
> > > lot of bad side effects generally, just the error states.
> > > 
> > > Furthermore we don't even enable these interrupts any more on gen5+
> > > (though the handling code is still there). So this mostly concerns old
> > > platforms.
> > > 
> > > Given all that lets make our lives a bit easier and stop capturing
> > > error states, in the hopes that we can just ignore them. In case
> > > that's not true and the gpu indeed dies the hangcheck should
> > > eventually kick in. And I've left some debug log in to make this case
> > > noticeble. Referenced bug is just an example.
> > 
> > The problem is they can be useful. They have shown when our modesetting
> > sequence has been completely snafu, and they can also be used to detect
> > page faults (but that does require a bit of kernel trickery) in
> > userspace GPU command streams. Even in the Display B on 845g, we must
> > have done something to upset the hardware, but we simply haven't
> > captured what. I am not yet convinced we want to throw all such reports
> > away, in case we do ignore genuine fail.
> > 
> > How about just toning down the error message for non-fatal faults, and
> > discarding the earlier error state should we get a fatal fault afterwards?
> 
> Hm yeah, that might work too.

I looked at this and it gets ugly fast. Given that we seem to have quite a
substantial false-positive rate (found one more by just reading recent bug
spam) and haven't enabled this on gen5+ I've decided to just merge
this one here. With the missing \n added ofc.

We can still inject manual captures using debugfs, and wiring this up
again if it indeed proves useful should be quick.
-Daniel
Chris Wilson Nov. 24, 2014, 9:42 p.m. UTC | #5
On Mon, Nov 24, 2014 at 09:57:32PM +0100, Daniel Vetter wrote:
> I looked at this and it gets ugly fast. Given that we seem to have a quite
> substantial false-positive (found one more by just reading recent bug
> spam) rate and haven't enabled this on gen5+ I've decided to just merge
> this one here. With the missing \n added ofc.

I am not 100% convinced they are false-positives, I just haven't found
the bug. In the past, they have been very reliable for detecting real
bugs.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 318a6a0724d0..2f78764cb215 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1319,10 +1319,8 @@  static void snb_gt_irq_handler(struct drm_device *dev,
 
 	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
 		      GT_BSD_CS_ERROR_INTERRUPT |
-		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
-		i915_handle_error(dev, false, "GT error interrupt 0x%08x",
-				  gt_iir);
-	}
+		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT))
+		DRM_DEBUG("Command parser error, gt_iir 0x%08x", gt_iir);
 
 	if (gt_iir & GT_PARITY_ERROR(dev))
 		ivybridge_parity_error_irq_handler(dev, gt_iir);
@@ -1715,11 +1713,8 @@  static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
 		if (pm_iir & PM_VEBOX_USER_INTERRUPT)
 			notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
 
-		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
-			i915_handle_error(dev_priv->dev, false,
-					  "VEBOX CS error interrupt 0x%08x",
-					  pm_iir);
-		}
+		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT)
+			DRM_DEBUG("Command parser error, pm_iir 0x%08x", pm_iir);
 	}
 }
 
@@ -3744,9 +3739,7 @@  static irqreturn_t i8xx_irq_handler(int irq, void *arg)
 		 */
 		spin_lock(&dev_priv->irq_lock);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false,
-					  "Command parser error, iir 0x%08x",
-					  iir);
+			DRM_DEBUG("Command parser error, iir 0x%08x", iir);
 
 		for_each_pipe(dev_priv, pipe) {
 			int reg = PIPESTAT(pipe);
@@ -3929,9 +3922,7 @@  static irqreturn_t i915_irq_handler(int irq, void *arg)
 		 */
 		spin_lock(&dev_priv->irq_lock);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false,
-					  "Command parser error, iir 0x%08x",
-					  iir);
+			DRM_DEBUG("Command parser error, iir 0x%08x", iir);
 
 		for_each_pipe(dev_priv, pipe) {
 			int reg = PIPESTAT(pipe);
@@ -4156,9 +4147,7 @@  static irqreturn_t i965_irq_handler(int irq, void *arg)
 		 */
 		spin_lock(&dev_priv->irq_lock);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false,
-					  "Command parser error, iir 0x%08x",
-					  iir);
+			DRM_DEBUG("Command parser error, iir 0x%08x", iir);
 
 		for_each_pipe(dev_priv, pipe) {
 			int reg = PIPESTAT(pipe);