diff mbox

[3/6] drm/i915: Use request retirement as context progress

Message ID 1479309634-28574-3-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala Nov. 16, 2016, 3:20 p.m. UTC
When the hangcheck score was removed, the active decay of the score
was removed along with it. This took away hangcheck's ability to detect
whether a gpu client was accidentally or maliciously causing intermittent
hangs. Reinstate the scoring as a per-context property, so that if
a context starts to act unfavourably, it gets banned.

v2: ban_period_secs as a gate to score check (Chris)
v3: decay in proper spot. scores as tunables (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  5 ++++
 drivers/gpu/drm/i915/i915_gem.c         | 44 ++++++++++++++++++++++-----------
 drivers/gpu/drm/i915/i915_gem_request.c |  4 +++
 3 files changed, 39 insertions(+), 14 deletions(-)

Comments

Chris Wilson Nov. 16, 2016, 5:08 p.m. UTC | #1
On Wed, Nov 16, 2016 at 05:20:31PM +0200, Mika Kuoppala wrote:
> As hangcheck score was removed, the active decay of score
> was removed also. This removed feature for hangcheck to detect
> if the gpu client was accidentally or maliciously causing intermittent
> hangs. Reinstate the scoring as a per context property, so that if
> one context starts to act unfavourably, ban it.
> 
> v2: ban_period_secs as a gate to score check (Chris)
> v3: decay in proper spot. scores as tunables (Chris)
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4562a39..9f24957 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -914,6 +914,11 @@  struct i915_ctx_hang_stats {
 
 	/* This context is banned to submit more work */
 	bool banned;
+
+#define CONTEXT_SCORE_GUILTY		10
+#define CONTEXT_SCORE_BAN_THRESHOLD	40
+	/* Accumulated score of hangs caused by this context */
+	int ban_score;
 };
 
 /* This must match up with the value previously used for execbuf2.rsvd1. */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ae2a219..5948f09 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2620,33 +2620,45 @@  void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
 
 static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 {
+	const struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
 	unsigned long elapsed;
 
-	if (ctx->hang_stats.banned)
+	if (hs->banned)
 		return true;
 
-	elapsed = get_seconds() - ctx->hang_stats.guilty_ts;
-	if (ctx->hang_stats.ban_period_seconds &&
-	    elapsed <= ctx->hang_stats.ban_period_seconds) {
+	if (!hs->ban_period_seconds)
+		return false;
+
+	elapsed = get_seconds() - hs->guilty_ts;
+	if (elapsed <= hs->ban_period_seconds) {
 		DRM_DEBUG("context hanging too fast, banning!\n");
 		return true;
 	}
 
+	if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) {
+		DRM_DEBUG("context hanging too often, banning!\n");
+		return true;
+	}
+
 	return false;
 }
 
-static void i915_set_reset_status(struct i915_gem_context *ctx,
-				  const bool guilty)
+static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 {
 	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
 
-	if (guilty) {
-		hs->banned = i915_context_is_banned(ctx);
-		hs->batch_active++;
-		hs->guilty_ts = get_seconds();
-	} else {
-		hs->batch_pending++;
-	}
+	hs->ban_score += CONTEXT_SCORE_GUILTY;
+
+	hs->banned = i915_context_is_banned(ctx);
+	hs->batch_active++;
+	hs->guilty_ts = get_seconds();
+}
+
+static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
+{
+	struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
+
+	hs->batch_pending++;
 }
 
 struct drm_i915_gem_request *
@@ -2714,7 +2726,11 @@  static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 		ring_hung = false;
 	}
 
-	i915_set_reset_status(request->ctx, ring_hung);
+	if (ring_hung)
+		i915_gem_context_mark_guilty(request->ctx);
+	else
+		i915_gem_context_mark_innocent(request->ctx);
+
 	if (!ring_hung)
 		return;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index b9b5253..b31d18e 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -255,6 +255,10 @@  static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 					       request->engine);
 	}
 
+	/* Retirement decays the ban score as it is a sign of ctx progress */
+	if (request->ctx->hang_stats.ban_score > 0)
+		request->ctx->hang_stats.ban_score--;
+
 	i915_gem_context_put(request->ctx);
 
 	dma_fence_signal(&request->fence);