diff mbox

[v2,3/3] drm/i915: Clean up GPU hang message

Message ID 1449874764-18735-3-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Dec. 11, 2015, 10:59 p.m. UTC
Remove some redundant kernel messages as we deduce a hung GPU and
capture the error state.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_irq.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

Comments

Dave Gordon Dec. 14, 2015, 11:28 a.m. UTC | #1
On 11/12/15 22:59, Chris Wilson wrote:
> Remove some redundant kernel messages as we deduce a hung GPU and
> capture the error state.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_irq.c | 16 ++++++----------
>   1 file changed, 6 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 4cfbd694b3a8..365d4872a67d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2963,7 +2963,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   	struct drm_device *dev = dev_priv->dev;
>   	struct intel_engine_cs *ring;
>   	int i;
> -	int busy_count = 0, rings_hung = 0;
> +	int busy_count = 0;
>   	bool stuck[I915_NUM_RINGS] = { 0 };
>   #define BUSY 1
>   #define KICK 5
> @@ -3057,17 +3057,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   	}
>
>   	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
> -			DRM_INFO("%s on %s\n",
> -				 stuck[i] ? "stuck" : "no progress",
> -				 ring->name);
> -			rings_hung++;
> -		}
> +		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
> +			return i915_handle_error(dev, true,
> +						 "%s on %s",
> +						 stuck[i] ? "No progress" : "Hang",
> +						 ring->name);
>   	}
>
> -	if (rings_hung)
> -		return i915_handle_error(dev, true, "Ring hung");
> -
>   	/* Reset timer in case GPU hangs without another request being added */
>   	if (busy_count)
>   		i915_queue_hangcheck(dev_priv);

This version provides less information (in dmesg & syslog) in the case 
that multiple rings have (been detected as) hung. Does this ever happen? 
Is the original more useful for finding bugs in hangcheck itself? If 
this is a test that has disabled error state capture (because it's 
*trying* to hang one or more rings) can we still know how many rings 
have been diagnosed as hung?

.Dave.
Chris Wilson Dec. 14, 2015, 11:39 a.m. UTC | #2
On Mon, Dec 14, 2015 at 11:28:39AM +0000, Dave Gordon wrote:
> On 11/12/15 22:59, Chris Wilson wrote:
> >Remove some redundant kernel messages as we deduce a hung GPU and
> >capture the error state.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/i915_irq.c | 16 ++++++----------
> >  1 file changed, 6 insertions(+), 10 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> >index 4cfbd694b3a8..365d4872a67d 100644
> >--- a/drivers/gpu/drm/i915/i915_irq.c
> >+++ b/drivers/gpu/drm/i915/i915_irq.c
> >@@ -2963,7 +2963,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> >  	struct drm_device *dev = dev_priv->dev;
> >  	struct intel_engine_cs *ring;
> >  	int i;
> >-	int busy_count = 0, rings_hung = 0;
> >+	int busy_count = 0;
> >  	bool stuck[I915_NUM_RINGS] = { 0 };
> >  #define BUSY 1
> >  #define KICK 5
> >@@ -3057,17 +3057,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> >  	}
> >
> >  	for_each_ring(ring, dev_priv, i) {
> >-		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
> >-			DRM_INFO("%s on %s\n",
> >-				 stuck[i] ? "stuck" : "no progress",
> >-				 ring->name);
> >-			rings_hung++;
> >-		}
> >+		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
> >+			return i915_handle_error(dev, true,
> >+						 "%s on %s",
> >+						 stuck[i] ? "No progress" : "Hang",
> >+						 ring->name);
> >  	}
> >
> >-	if (rings_hung)
> >-		return i915_handle_error(dev, true, "Ring hung");
> >-
> >  	/* Reset timer in case GPU hangs without another request being added */
> >  	if (busy_count)
> >  		i915_queue_hangcheck(dev_priv);
> 
> This version provides less information (in dmesg & syslog) in the
> case that multiple rings have (been detected as) hung. Does this
> ever happen?

Not often. And intended, since that information is already in the error
state.

> Is the original more useful for finding bugs in
> hangcheck itself?

No. See i915_hangcheck_info.

> If this is a test that has disabled error state
> capture (because it's *trying* to hang one or more rings) can we
> still know how many rings have been diagnosed as hung?

If you have a use case, then you can look at the interface you require.
i915_hangcheck_info should be sufficient in most cases, or at least a
good starting point. But you may want a more specific debugfs to avoid
parsing.
-Chris
Chris Wilson Dec. 14, 2015, 1:45 p.m. UTC | #3
On Mon, Dec 14, 2015 at 11:39:47AM +0000, Chris Wilson wrote:
> On Mon, Dec 14, 2015 at 11:28:39AM +0000, Dave Gordon wrote:
> > If this is a test that has disabled error state
> > capture (because it's *trying* to hang one or more rings) can we
> > still know how many rings have been diagnosed as hung?
> 
> If you have a use case, then you can look at the interface you require.
> i915_hangcheck_info should be sufficient in most cases, or at least a
> good starting point. But you may want a more specific debugfs to avoid
> parsing.

Gah, even better is just to use the reset-stats ioctl!
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 4cfbd694b3a8..365d4872a67d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2963,7 +2963,7 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 	struct drm_device *dev = dev_priv->dev;
 	struct intel_engine_cs *ring;
 	int i;
-	int busy_count = 0, rings_hung = 0;
+	int busy_count = 0;
 	bool stuck[I915_NUM_RINGS] = { 0 };
 #define BUSY 1
 #define KICK 5
@@ -3057,17 +3057,13 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 	}
 
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-			DRM_INFO("%s on %s\n",
-				 stuck[i] ? "stuck" : "no progress",
-				 ring->name);
-			rings_hung++;
-		}
+		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
+			return i915_handle_error(dev, true,
+						 "%s on %s",
+						 stuck[i] ? "No progress" : "Hang",
+						 ring->name);
 	}
 
-	if (rings_hung)
-		return i915_handle_error(dev, true, "Ring hung");
-
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
 		i915_queue_hangcheck(dev_priv);