diff mbox

drm/i915: Fix context ban and hang accounting for client

Message ID 20180615101828.24195-1-mika.kuoppala@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala June 15, 2018, 10:18 a.m. UTC
If client is smart or lucky enough to create a new context
after each hang, our context banning mechanism will never
catch up, and as a result of that it will be saved from
client banning. This can result in a never ending streak of
gpu hangs caused by bad or malicious client, preventing
access from other legit gpu clients.

Fix this by always incrementing per client ban score if
it hangs in short successions regardless of context ban
scoring. The exception are non bannable contexts. They remain
detached from client ban scoring mechanism.

v2: xchg timestamp, tidyup (Chris)

Fixes: b083a0870c79 ("drm/i915: Add per client max context ban limit")
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         | 20 ++++++---
 drivers/gpu/drm/i915/i915_gem.c         | 57 +++++++++++++++++--------
 drivers/gpu/drm/i915/i915_gem_context.c |  2 +-
 3 files changed, 54 insertions(+), 25 deletions(-)

Comments

Chris Wilson June 15, 2018, 10:28 a.m. UTC | #1
Quoting Mika Kuoppala (2018-06-15 11:18:28)
> If client is smart or lucky enough to create a new context
> after each hang, our context banning mechanism will never
> catch up, and as a result of that it will be saved from
> client banning. This can result in a never ending streak of
> gpu hangs caused by bad or malicious client, preventing
> access from other legit gpu clients.
> 
> Fix this by always incrementing per client ban score if
> it hangs in short successions regardless of context ban
> scoring. The exception are non bannable contexts. They remain
> detached from client ban scoring mechanism.
> 
> v2: xchg timestamp, tidyup (Chris)
> 
> Fixes: b083a0870c79 ("drm/i915: Add per client max context ban limit")
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         | 20 ++++++---
>  drivers/gpu/drm/i915/i915_gem.c         | 57 +++++++++++++++++--------
>  drivers/gpu/drm/i915/i915_gem_context.c |  2 +-
>  3 files changed, 54 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 74dd88d8563e..93aa8e7dfaba 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -352,14 +352,20 @@ struct drm_i915_file_private {
>  
>         unsigned int bsd_engine;
>  
> -/* Client can have a maximum of 3 contexts banned before
> - * it is denied of creating new contexts. As one context
> - * ban needs 4 consecutive hangs, and more if there is
> - * progress in between, this is a last resort stop gap measure
> - * to limit the badly behaving clients access to gpu.
> +/* Every context ban increments per client ban score. Also

/*
 * Every

One day we'll have rewritten every line of code, and every comment.

> + * hangs in short succession increments ban score. If client suffers 3
> + * context bans, 9 hangs in quick succession or combination of those,

Leave out the numbers if possible, just explain the rationale of the
multilevel system.

> + * it is banned and submitting more work will fail. This is a stop gap
> + * measure to limit the badly behaving clients access to gpu.
> + * Note that unbannable contexts never increment the client ban score.
>   */
> -#define I915_MAX_CLIENT_CONTEXT_BANS 3
> -       atomic_t context_bans;
> +#define I915_CLIENT_SCORE_HANG_FAST    1
> +#define   I915_CLIENT_FAST_HANG_JIFFIES (60 * HZ)
> +#define I915_CLIENT_SCORE_CONTEXT_BAN   3
> +#define I915_CLIENT_SCORE_BANNED       9
> +       /** ban_score: Accumulated score of all ctx bans and fast hangs. */
> +       atomic_t ban_score;
> +       unsigned long hang_timestamp;
>  };
>  
>  /* Interface history:
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 8dd4d35655af..f06fe1c636e5 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2941,32 +2941,54 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
>         return 0;
>  }
>  
> +static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
> +                                       const struct i915_gem_context *ctx)
> +{
> +       unsigned int score;
> +       unsigned long prev_hang;
> +
> +       if (i915_gem_context_is_banned(ctx))
> +               score = I915_CLIENT_SCORE_CONTEXT_BAN;
> +       else
> +               score = 0;
> +
> +       prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
> +       if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
> +               score += I915_CLIENT_SCORE_HANG_FAST;
> +
> +       if (score) {
> +               atomic_add(score, &file_priv->ban_score);
> +
> +               DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
> +                                ctx->name, score,
> +                                atomic_read(&file_priv->ban_score));
> +       }
> +}
> +
>  static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>  {
> -       bool banned;
> +       unsigned int score;
> +       bool banned, bannable;
>  
>         atomic_inc(&ctx->guilty_count);
>  
> -       banned = false;
> -       if (i915_gem_context_is_bannable(ctx)) {
> -               unsigned int score;
> +       bannable = i915_gem_context_is_bannable(ctx);
> +       score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> +       banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
>  
> -               score = atomic_add_return(CONTEXT_SCORE_GUILTY,
> -                                         &ctx->ban_score);
> -               banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> +       DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, ban %s:%s\n",
> +                        ctx->name, atomic_read(&ctx->guilty_count),
> +                        score, yesno(bannable), yesno(banned));

ban: no:yes

Wut? Maybe just yesno(banned && bannable) as even debug messages
shouldn't strive to confuse us further.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
Mika Kuoppala June 15, 2018, 1:41 p.m. UTC | #2
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2018-06-15 11:18:28)
>> If client is smart or lucky enough to create a new context
>> after each hang, our context banning mechanism will never
>> catch up, and as a result of that it will be saved from
>> client banning. This can result in a never ending streak of
>> gpu hangs caused by bad or malicious client, preventing
>> access from other legit gpu clients.
>> 
>> Fix this by always incrementing per client ban score if
>> it hangs in short successions regardless of context ban
>> scoring. The exception are non bannable contexts. They remain
>> detached from client ban scoring mechanism.
>> 
>> v2: xchg timestamp, tidyup (Chris)
>> 
>> Fixes: b083a0870c79 ("drm/i915: Add per client max context ban limit")
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h         | 20 ++++++---
>>  drivers/gpu/drm/i915/i915_gem.c         | 57 +++++++++++++++++--------
>>  drivers/gpu/drm/i915/i915_gem_context.c |  2 +-
>>  3 files changed, 54 insertions(+), 25 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 74dd88d8563e..93aa8e7dfaba 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -352,14 +352,20 @@ struct drm_i915_file_private {
>>  
>>         unsigned int bsd_engine;
>>  
>> -/* Client can have a maximum of 3 contexts banned before
>> - * it is denied of creating new contexts. As one context
>> - * ban needs 4 consecutive hangs, and more if there is
>> - * progress in between, this is a last resort stop gap measure
>> - * to limit the badly behaving clients access to gpu.
>> +/* Every context ban increments per client ban score. Also
>
> /*
>  * Every
>
> One day we'll have rewritten every line of code, and every comment.
>
>> + * hangs in short succession increments ban score. If client suffers 3
>> + * context bans, 9 hangs in quick succession or combination of those,
>
> Leave out the numbers if possible, just explain the rationale of the
> multilevel system.
>
>> + * it is banned and submitting more work will fail. This is a stop gap
>> + * measure to limit the badly behaving clients access to gpu.
>> + * Note that unbannable contexts never increment the client ban score.
>>   */
>> -#define I915_MAX_CLIENT_CONTEXT_BANS 3
>> -       atomic_t context_bans;
>> +#define I915_CLIENT_SCORE_HANG_FAST    1
>> +#define   I915_CLIENT_FAST_HANG_JIFFIES (60 * HZ)
>> +#define I915_CLIENT_SCORE_CONTEXT_BAN   3
>> +#define I915_CLIENT_SCORE_BANNED       9
>> +       /** ban_score: Accumulated score of all ctx bans and fast hangs. */
>> +       atomic_t ban_score;
>> +       unsigned long hang_timestamp;
>>  };
>>  
>>  /* Interface history:
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 8dd4d35655af..f06fe1c636e5 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2941,32 +2941,54 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
>>         return 0;
>>  }
>>  
>> +static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
>> +                                       const struct i915_gem_context *ctx)
>> +{
>> +       unsigned int score;
>> +       unsigned long prev_hang;
>> +
>> +       if (i915_gem_context_is_banned(ctx))
>> +               score = I915_CLIENT_SCORE_CONTEXT_BAN;
>> +       else
>> +               score = 0;
>> +
>> +       prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
>> +       if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
>> +               score += I915_CLIENT_SCORE_HANG_FAST;
>> +
>> +       if (score) {
>> +               atomic_add(score, &file_priv->ban_score);
>> +
>> +               DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
>> +                                ctx->name, score,
>> +                                atomic_read(&file_priv->ban_score));
>> +       }
>> +}
>> +
>>  static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>>  {
>> -       bool banned;
>> +       unsigned int score;
>> +       bool banned, bannable;
>>  
>>         atomic_inc(&ctx->guilty_count);
>>  
>> -       banned = false;
>> -       if (i915_gem_context_is_bannable(ctx)) {
>> -               unsigned int score;
>> +       bannable = i915_gem_context_is_bannable(ctx);
>> +       score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
>> +       banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
>>  
>> -               score = atomic_add_return(CONTEXT_SCORE_GUILTY,
>> -                                         &ctx->ban_score);
>> -               banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
>> +       DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, ban %s:%s\n",
>> +                        ctx->name, atomic_read(&ctx->guilty_count),
>> +                        score, yesno(bannable), yesno(banned));
>
> ban: no:yes
>
> Wut? Maybe just yesno(banned && bannable) as even debug messages
> shouldn't strive to confuse us further.
>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Fixed the above and pushed. Thanks for review!
-Mika
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 74dd88d8563e..93aa8e7dfaba 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -352,14 +352,20 @@  struct drm_i915_file_private {
 
 	unsigned int bsd_engine;
 
-/* Client can have a maximum of 3 contexts banned before
- * it is denied of creating new contexts. As one context
- * ban needs 4 consecutive hangs, and more if there is
- * progress in between, this is a last resort stop gap measure
- * to limit the badly behaving clients access to gpu.
+/* Every context ban increments per client ban score. Also
+ * hangs in short succession increments ban score. If client suffers 3
+ * context bans, 9 hangs in quick succession or combination of those,
+ * it is banned and submitting more work will fail. This is a stop gap
+ * measure to limit the badly behaving clients access to gpu.
+ * Note that unbannable contexts never increment the client ban score.
  */
-#define I915_MAX_CLIENT_CONTEXT_BANS 3
-	atomic_t context_bans;
+#define I915_CLIENT_SCORE_HANG_FAST	1
+#define   I915_CLIENT_FAST_HANG_JIFFIES (60 * HZ)
+#define I915_CLIENT_SCORE_CONTEXT_BAN   3
+#define I915_CLIENT_SCORE_BANNED	9
+	/** ban_score: Accumulated score of all ctx bans and fast hangs. */
+	atomic_t ban_score;
+	unsigned long hang_timestamp;
 };
 
 /* Interface history:
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 8dd4d35655af..f06fe1c636e5 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2941,32 +2941,54 @@  i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
 	return 0;
 }
 
+static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
+					const struct i915_gem_context *ctx)
+{
+	unsigned int score;
+	unsigned long prev_hang;
+
+	if (i915_gem_context_is_banned(ctx))
+		score = I915_CLIENT_SCORE_CONTEXT_BAN;
+	else
+		score = 0;
+
+	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
+	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
+		score += I915_CLIENT_SCORE_HANG_FAST;
+
+	if (score) {
+		atomic_add(score, &file_priv->ban_score);
+
+		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
+				 ctx->name, score,
+				 atomic_read(&file_priv->ban_score));
+	}
+}
+
 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 {
-	bool banned;
+	unsigned int score;
+	bool banned, bannable;
 
 	atomic_inc(&ctx->guilty_count);
 
-	banned = false;
-	if (i915_gem_context_is_bannable(ctx)) {
-		unsigned int score;
+	bannable = i915_gem_context_is_bannable(ctx);
+	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
+	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
 
-		score = atomic_add_return(CONTEXT_SCORE_GUILTY,
-					  &ctx->ban_score);
-		banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
+	DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, ban %s:%s\n",
+			 ctx->name, atomic_read(&ctx->guilty_count),
+			 score, yesno(bannable), yesno(banned));
 
-		DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
-				 ctx->name, score, yesno(banned));
-	}
-	if (!banned)
+	/* Cool contexts don't accumulate client ban score */
+	if (!bannable)
 		return;
 
-	i915_gem_context_set_banned(ctx);
-	if (!IS_ERR_OR_NULL(ctx->file_priv)) {
-		atomic_inc(&ctx->file_priv->context_bans);
-		DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
-				 ctx->name, atomic_read(&ctx->file_priv->context_bans));
-	}
+	if (banned)
+		i915_gem_context_set_banned(ctx);
+
+	if (!IS_ERR_OR_NULL(ctx->file_priv))
+		i915_gem_client_mark_guilty(ctx->file_priv, ctx);
 }
 
 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
@@ -5820,6 +5842,7 @@  int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
 	INIT_LIST_HEAD(&file_priv->mm.request_list);
 
 	file_priv->bsd_engine = -1;
+	file_priv->hang_timestamp = jiffies;
 
 	ret = i915_gem_context_open(i915, file);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index ef6ea4bcd773..ccf463ab6562 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -708,7 +708,7 @@  int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
 
 static bool client_is_banned(struct drm_i915_file_private *file_priv)
 {
-	return atomic_read(&file_priv->context_bans) > I915_MAX_CLIENT_CONTEXT_BANS;
+	return atomic_read(&file_priv->ban_score) >= I915_CLIENT_SCORE_BANNED;
 }
 
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,