[2/2] drm/i915: Make hangcheck logging more compact

Message ID	1431092395-23930-2-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <intel-gfx-bounces@lists.freedesktop.org> From: Mika Kuoppala <mika.kuoppala@linux.intel.com> To: intel-gfx@lists.freedesktop.org Date: Fri, 8 May 2015 16:39:55 +0300 Message-Id: <1431092395-23930-2-git-send-email-mika.kuoppala@intel.com> In-Reply-To: <1431092395-23930-1-git-send-email-mika.kuoppala@intel.com> References: <1431092395-23930-1-git-send-email-mika.kuoppala@intel.com> Cc: miku@iki.fi Subject: [Intel-gfx] [PATCH 2/2] drm/i915: Make hangcheck logging more compact Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: intel-gfx-bounces@lists.freedesktop.org Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Message ID

1431092395-23930-2-git-send-email-mika.kuoppala@intel.com (mailing list archive)

State

New, archived

Headers

From: Mika Kuoppala <mika.kuoppala@linux.intel.com>
To: intel-gfx@lists.freedesktop.org
Date: Fri,  8 May 2015 16:39:55 +0300
Message-Id: <1431092395-23930-2-git-send-email-mika.kuoppala@intel.com>
In-Reply-To: <1431092395-23930-1-git-send-email-mika.kuoppala@intel.com>
References: <1431092395-23930-1-git-send-email-mika.kuoppala@intel.com>
Cc: miku@iki.fi
Subject: [Intel-gfx] [PATCH 2/2] drm/i915: Make hangcheck logging more
	compact
Precedence: list
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Errors-To: intel-gfx-bounces@lists.freedesktop.org
Sender: "Intel-gfx" <intel-gfx-bounces@lists.freedesktop.org>

Commit Message

Mika Kuoppala May 8, 2015, 1:39 p.m. UTC

With commit aaecdf611a05 ("drm/i915: Stop gathering error
states for CS error interrupts") we only call i915_handle_error()
from call sites where there is a stuck/hung GPU. So there is
no longer any need to carry extra information into dmesg.

Emit one loud bang into dmesg with the first hanging ring as
the culprit. The rest of the details will be in the error state.

Based-on-patch-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gpu_error.c |  4 +---
 drivers/gpu/drm/i915/i915_irq.c       | 26 ++++++++------------------
 2 files changed, 9 insertions(+), 21 deletions(-)

Comments

Shuang He May 8, 2015, 11:55 p.m. UTC | #1

Tested-By: Intel Graphics QA PRTS (Patch Regression Test System Contact: shuang.he@intel.com)
Task id: 6357
-------------------------------------Summary-------------------------------------
Platform          Delta          drm-intel-nightly          Series Applied
PNV                                  276/276              276/276
ILK                 -1              302/302              301/302
SNB                                  316/316              316/316
IVB                                  342/342              342/342
BYT                                  286/286              286/286
BDW                                  321/321              321/321
-------------------------------------Detailed-------------------------------------
Platform  Test                                drm-intel-nightly          Series Applied
*ILK  igt@kms_flip@vblank-vs-hang      PASS(2)      TIMEOUT(2)
Note: You need to pay more attention to lines starting with '*'

Tomas Elf May 19, 2015, 11:25 a.m. UTC | #2

On 08/05/2015 14:39, Mika Kuoppala wrote:
> With commit aaecdf611a05 ("drm/i915: Stop gathering error
> states for CS error interrupts") we only call i915_handle_error()
> on call sites where there is a stuck/hung gpu. So there is
> no more need to carry around extra information into dmesg.
>
> Emit one loud bang into dmesg with first hanging ring as
> culprit. Rest of the details will be in error state.
>
> Based-on-patch-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gpu_error.c |  4 +---
>   drivers/gpu/drm/i915/i915_irq.c       | 26 ++++++++------------------
>   2 files changed, 9 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 9c0db19..292cf1f 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1251,9 +1251,7 @@ static void i915_error_capture_msg(struct drm_device *dev,
>   				 error->ring[ring_id].pid);
>
>   	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
> -		  ", reason: %s, action: %s",
> -		  error_msg,
> -		  wedged ? "reset" : "continue");
> +		  ", %s", error_msg);
>   }
>

Once you've removed the reference to the wedged parameter from the 
scnprintf statement I can't see any other references to it anywhere else 
in the function. How about we remove that parameter entirely from the 
function signature?

Thanks,
Tomas

>   static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index a3244bd..a3b5001 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2924,14 +2924,12 @@ static bool check_for_missed_irq(struct intel_engine_cs *ring)
>   	return true;
>   }
>
> -static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
> +static void hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
>   {
>   #define BUSY 1
>   #define KICK 5
>   #define HUNG 20
> -
>   	struct intel_ring_hangcheck *hc = &ring->hangcheck;
> -	bool there_is_hope = true;
>
>   	/* We always increment the hangcheck score
>   	 * if the ring is busy and still processing
> @@ -2964,11 +2962,8 @@ static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
>   		break;
>   	case HANGCHECK_HUNG:
>   		hc->score += HUNG;
> -		there_is_hope = false;
>   		break;
>   	}
> -
> -	return there_is_hope;
>   }
>
>   /*
> @@ -2987,8 +2982,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   	struct drm_device *dev = dev_priv->dev;
>   	struct intel_engine_cs *ring;
>   	int i;
> -	int busy_count = 0, rings_hung = 0;
> -	bool stuck[I915_NUM_RINGS] = { 0 };
> +	int busy_count = 0, ring_hung = -1;
>
>   	if (!i915.enable_hangcheck)
>   		return;
> @@ -3043,19 +3037,15 @@ engine_check_done:
>   		hc->acthd = acthd;
>   		hc->start = start;
>   		busy_count += busy;
> -	}
>
> -	for_each_ring(ring, dev_priv, i) {
> -		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
> -			DRM_INFO("%s on %s\n",
> -				 stuck[i] ? "stuck" : "no progress",
> -				 ring->name);
> -			rings_hung++;
> -		}
> +		if (ring_hung == -1 &&
> +		    ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
> +			ring_hung = i;
>   	}
>
> -	if (rings_hung)
> -		return i915_handle_error(dev, true, "Ring hung");
> +	if (ring_hung != -1)
> +		return i915_handle_error(dev, true, "%s hung",
> +					 dev_priv->ring[ring_hung].name);
>
>   	if (busy_count)
>   		/* Reset timer case chip hangs without another request
>

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 9c0db19..292cf1f 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1251,9 +1251,7 @@  static void i915_error_capture_msg(struct drm_device *dev,
 				 error->ring[ring_id].pid);
 
 	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
-		  ", reason: %s, action: %s",
-		  error_msg,
-		  wedged ? "reset" : "continue");
+		  ", %s", error_msg);
 }
 
 static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index a3244bd..a3b5001 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2924,14 +2924,12 @@  static bool check_for_missed_irq(struct intel_engine_cs *ring)
 	return true;
 }
 
-static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
+static void hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
 {
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
-
 	struct intel_ring_hangcheck *hc = &ring->hangcheck;
-	bool there_is_hope = true;
 
 	/* We always increment the hangcheck score
 	 * if the ring is busy and still processing
@@ -2964,11 +2962,8 @@  static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
 		break;
 	case HANGCHECK_HUNG:
 		hc->score += HUNG;
-		there_is_hope = false;
 		break;
 	}
-
-	return there_is_hope;
 }
 
 /*
@@ -2987,8 +2982,7 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 	struct drm_device *dev = dev_priv->dev;
 	struct intel_engine_cs *ring;
 	int i;
-	int busy_count = 0, rings_hung = 0;
-	bool stuck[I915_NUM_RINGS] = { 0 };
+	int busy_count = 0, ring_hung = -1;
 
 	if (!i915.enable_hangcheck)
 		return;
@@ -3043,19 +3037,15 @@  engine_check_done:
 		hc->acthd = acthd;
 		hc->start = start;
 		busy_count += busy;
-	}
 
-	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-			DRM_INFO("%s on %s\n",
-				 stuck[i] ? "stuck" : "no progress",
-				 ring->name);
-			rings_hung++;
-		}
+		if (ring_hung == -1 &&
+		    ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG)
+			ring_hung = i;
 	}
 
-	if (rings_hung)
-		return i915_handle_error(dev, true, "Ring hung");
+	if (ring_hung != -1)
+		return i915_handle_error(dev, true, "%s hung",
+					 dev_priv->ring[ring_hung].name);
 
 	if (busy_count)
 		/* Reset timer case chip hangs without another request