diff mbox

drm/i915: Make i915_gem_context_mark_guilty() safe for unlocked updates

Message ID 20170709093513.26913-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson July 9, 2017, 9:35 a.m. UTC
Since we make call i915_gem_context_mark_guilty() concurrently when
resetting different engines in parallel, we need to make sure that our
updates are safe for the unlocked access.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  2 +-
 drivers/gpu/drm/i915/i915_gem.c         | 32 ++++++++++++++++++--------------
 drivers/gpu/drm/i915/i915_gem_context.c |  6 +++---
 drivers/gpu/drm/i915/i915_gem_context.h |  6 +++---
 drivers/gpu/drm/i915/i915_gem_request.c |  3 +--
 drivers/gpu/drm/i915/i915_gpu_error.c   |  8 ++++----
 6 files changed, 30 insertions(+), 27 deletions(-)

Comments

Mika Kuoppala July 10, 2017, 2:14 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Since we make call i915_gem_context_mark_guilty() concurrently when
> resetting different engines in parallel, we need to make sure that our
> updates are safe for the unlocked access.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michel Thierry <michel.thierry@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>

And we dont ever clear the context bannable flag so
this should be fine.

One improvement would be that only the triggering
ban would spew the message. But now thinking about it,
the dual message would reveal the concurrent ban and
is more informative.

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c         | 32 ++++++++++++++++++--------------
>  drivers/gpu/drm/i915/i915_gem_context.c |  6 +++---
>  drivers/gpu/drm/i915/i915_gem_context.h |  6 +++---
>  drivers/gpu/drm/i915/i915_gem_request.c |  3 +--
>  drivers/gpu/drm/i915/i915_gpu_error.c   |  8 ++++----
>  6 files changed, 30 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 81cd21ecfa7d..62208b3bf417 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -596,7 +596,7 @@ struct drm_i915_file_private {
>   * to limit the badly behaving clients access to gpu.
>   */
>  #define I915_MAX_CLIENT_CONTEXT_BANS 3
> -	int context_bans;
> +	atomic_t context_bans;
>  };
>  
>  /* Used by dp and fdi links */
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1564cadda94d..cbe70a94b663 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2739,34 +2739,38 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
>  	return 0;
>  }
>  
> -static bool ban_context(const struct i915_gem_context *ctx)
> +static bool ban_context(const struct i915_gem_context *ctx,
> +			unsigned int score)
>  {
>  	return (i915_gem_context_is_bannable(ctx) &&
> -		ctx->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD);
> +		score >= CONTEXT_SCORE_BAN_THRESHOLD);
>  }
>  
>  static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>  {
> -	ctx->guilty_count++;
> -	ctx->ban_score += CONTEXT_SCORE_GUILTY;
> -	if (ban_context(ctx))
> -		i915_gem_context_set_banned(ctx);
> +	unsigned int score;
> +	bool banned;
>  
> -	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
> -			 ctx->name, ctx->ban_score,
> -			 yesno(i915_gem_context_is_banned(ctx)));
> +	atomic_inc(&ctx->guilty_count);
>  
> -	if (!i915_gem_context_is_banned(ctx) || IS_ERR_OR_NULL(ctx->file_priv))
> +	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> +	banned = ban_context(ctx, score);
> +	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
> +			 ctx->name, score, yesno(banned));
> +	if (!banned)
>  		return;
>  
> -	ctx->file_priv->context_bans++;
> -	DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
> -			 ctx->name, ctx->file_priv->context_bans);
> +	i915_gem_context_set_banned(ctx);
> +	if (!IS_ERR_OR_NULL(ctx->file_priv)) {
> +		atomic_inc(&ctx->file_priv->context_bans);
> +		DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
> +				 ctx->name, atomic_read(&ctx->file_priv->context_bans));
> +	}
>  }
>  
>  static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
>  {
> -	ctx->active_count++;
> +	atomic_inc(&ctx->active_count);
>  }
>  
>  struct drm_i915_gem_request *
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 1a87d04e7937..ed91ac8ca832 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -977,7 +977,7 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *dev_priv)
>  
>  static bool client_is_banned(struct drm_i915_file_private *file_priv)
>  {
> -	return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
> +	return atomic_read(&file_priv->context_bans) > I915_MAX_CLIENT_CONTEXT_BANS;
>  }
>  
>  int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
> @@ -1179,8 +1179,8 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
>  	else
>  		args->reset_count = 0;
>  
> -	args->batch_active = READ_ONCE(ctx->guilty_count);
> -	args->batch_pending = READ_ONCE(ctx->active_count);
> +	args->batch_active = atomic_read(&ctx->guilty_count);
> +	args->batch_pending = atomic_read(&ctx->active_count);
>  
>  	ret = 0;
>  out:
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 04320f80f9f4..2d02918a449e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -191,17 +191,17 @@ struct i915_gem_context {
>  	u32 desc_template;
>  
>  	/** guilty_count: How many times this context has caused a GPU hang. */
> -	unsigned int guilty_count;
> +	atomic_t guilty_count;
>  	/**
>  	 * @active_count: How many times this context was active during a GPU
>  	 * hang, but did not cause it.
>  	 */
> -	unsigned int active_count;
> +	atomic_t active_count;
>  
>  #define CONTEXT_SCORE_GUILTY		10
>  #define CONTEXT_SCORE_BAN_THRESHOLD	40
>  	/** ban_score: Accumulated score of all hangs caused by this context. */
> -	int ban_score;
> +	atomic_t ban_score;
>  
>  	/** remap_slice: Bitmask of cache lines that need remapping */
>  	u8 remap_slice;
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 483af8921060..8ba89abe5441 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -370,8 +370,7 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
>  	i915_gem_request_remove_from_client(request);
>  
>  	/* Retirement decays the ban score as it is a sign of ctx progress */
> -	if (request->ctx->ban_score > 0)
> -		request->ctx->ban_score--;
> +	atomic_dec_if_positive(&request->ctx->ban_score);
>  
>  	/* The backing object for the context is done after switching to the
>  	 * *next* context. Therefore we cannot retire the previous context until
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index ae70283470a6..ed5a1eb839ad 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1266,7 +1266,7 @@ static void record_request(struct drm_i915_gem_request *request,
>  			   struct drm_i915_error_request *erq)
>  {
>  	erq->context = request->ctx->hw_id;
> -	erq->ban_score = request->ctx->ban_score;
> +	erq->ban_score = atomic_read(&request->ctx->ban_score);
>  	erq->seqno = request->global_seqno;
>  	erq->jiffies = request->emitted_jiffies;
>  	erq->head = request->head;
> @@ -1357,9 +1357,9 @@ static void record_context(struct drm_i915_error_context *e,
>  
>  	e->handle = ctx->user_handle;
>  	e->hw_id = ctx->hw_id;
> -	e->ban_score = ctx->ban_score;
> -	e->guilty = ctx->guilty_count;
> -	e->active = ctx->active_count;
> +	e->ban_score = atomic_read(&ctx->ban_score);
> +	e->guilty = atomic_read(&ctx->guilty_count);
> +	e->active = atomic_read(&ctx->active_count);
>  }
>  
>  static void request_record_user_bo(struct drm_i915_gem_request *request,
> -- 
> 2.13.2
Chris Wilson July 11, 2017, 11:27 a.m. UTC | #2
Quoting Mika Kuoppala (2017-07-10 15:14:19)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Since we make call i915_gem_context_mark_guilty() concurrently when
> > resetting different engines in parallel, we need to make sure that our
> > updates are safe for the unlocked access.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Michel Thierry <michel.thierry@intel.com>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> 
> And we dont ever clear the context bannable flag so
> this should be fine.
> 
> One improvement would be that only the triggering
> ban would spew the message. But now thinking about it,
> the dual message would reveal the concurrent ban and
> is more informative.

It only tells you that we encountered a race condition in handling a
reset on two engines. A double ban message is going to be more confusing
than anything.

Did I send the patch to add the user message for the per-engine reset? I
was having withdrawal symptoms from not seeing it before mesa died.

Otoh, we clearly do propagate the error to userspace, but on the other
hand I'm used to seeing it. It's akin to the kernel reporting when it
sends a terminal signal to a process. I've been trying to find that to
see if they discuss the same issue and in particular why its hidden
behind kconfig and sysctl.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 81cd21ecfa7d..62208b3bf417 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -596,7 +596,7 @@  struct drm_i915_file_private {
  * to limit the badly behaving clients access to gpu.
  */
 #define I915_MAX_CLIENT_CONTEXT_BANS 3
-	int context_bans;
+	atomic_t context_bans;
 };
 
 /* Used by dp and fdi links */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1564cadda94d..cbe70a94b663 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2739,34 +2739,38 @@  i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
 	return 0;
 }
 
-static bool ban_context(const struct i915_gem_context *ctx)
+static bool ban_context(const struct i915_gem_context *ctx,
+			unsigned int score)
 {
 	return (i915_gem_context_is_bannable(ctx) &&
-		ctx->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD);
+		score >= CONTEXT_SCORE_BAN_THRESHOLD);
 }
 
 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 {
-	ctx->guilty_count++;
-	ctx->ban_score += CONTEXT_SCORE_GUILTY;
-	if (ban_context(ctx))
-		i915_gem_context_set_banned(ctx);
+	unsigned int score;
+	bool banned;
 
-	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
-			 ctx->name, ctx->ban_score,
-			 yesno(i915_gem_context_is_banned(ctx)));
+	atomic_inc(&ctx->guilty_count);
 
-	if (!i915_gem_context_is_banned(ctx) || IS_ERR_OR_NULL(ctx->file_priv))
+	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
+	banned = ban_context(ctx, score);
+	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
+			 ctx->name, score, yesno(banned));
+	if (!banned)
 		return;
 
-	ctx->file_priv->context_bans++;
-	DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
-			 ctx->name, ctx->file_priv->context_bans);
+	i915_gem_context_set_banned(ctx);
+	if (!IS_ERR_OR_NULL(ctx->file_priv)) {
+		atomic_inc(&ctx->file_priv->context_bans);
+		DRM_DEBUG_DRIVER("client %s has had %d context banned\n",
+				 ctx->name, atomic_read(&ctx->file_priv->context_bans));
+	}
 }
 
 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
 {
-	ctx->active_count++;
+	atomic_inc(&ctx->active_count);
 }
 
 struct drm_i915_gem_request *
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 1a87d04e7937..ed91ac8ca832 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -977,7 +977,7 @@  int i915_gem_switch_to_kernel_context(struct drm_i915_private *dev_priv)
 
 static bool client_is_banned(struct drm_i915_file_private *file_priv)
 {
-	return file_priv->context_bans > I915_MAX_CLIENT_CONTEXT_BANS;
+	return atomic_read(&file_priv->context_bans) > I915_MAX_CLIENT_CONTEXT_BANS;
 }
 
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
@@ -1179,8 +1179,8 @@  int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
 	else
 		args->reset_count = 0;
 
-	args->batch_active = READ_ONCE(ctx->guilty_count);
-	args->batch_pending = READ_ONCE(ctx->active_count);
+	args->batch_active = atomic_read(&ctx->guilty_count);
+	args->batch_pending = atomic_read(&ctx->active_count);
 
 	ret = 0;
 out:
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 04320f80f9f4..2d02918a449e 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -191,17 +191,17 @@  struct i915_gem_context {
 	u32 desc_template;
 
 	/** guilty_count: How many times this context has caused a GPU hang. */
-	unsigned int guilty_count;
+	atomic_t guilty_count;
 	/**
 	 * @active_count: How many times this context was active during a GPU
 	 * hang, but did not cause it.
 	 */
-	unsigned int active_count;
+	atomic_t active_count;
 
 #define CONTEXT_SCORE_GUILTY		10
 #define CONTEXT_SCORE_BAN_THRESHOLD	40
 	/** ban_score: Accumulated score of all hangs caused by this context. */
-	int ban_score;
+	atomic_t ban_score;
 
 	/** remap_slice: Bitmask of cache lines that need remapping */
 	u8 remap_slice;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 483af8921060..8ba89abe5441 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -370,8 +370,7 @@  static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	i915_gem_request_remove_from_client(request);
 
 	/* Retirement decays the ban score as it is a sign of ctx progress */
-	if (request->ctx->ban_score > 0)
-		request->ctx->ban_score--;
+	atomic_dec_if_positive(&request->ctx->ban_score);
 
 	/* The backing object for the context is done after switching to the
 	 * *next* context. Therefore we cannot retire the previous context until
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ae70283470a6..ed5a1eb839ad 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1266,7 +1266,7 @@  static void record_request(struct drm_i915_gem_request *request,
 			   struct drm_i915_error_request *erq)
 {
 	erq->context = request->ctx->hw_id;
-	erq->ban_score = request->ctx->ban_score;
+	erq->ban_score = atomic_read(&request->ctx->ban_score);
 	erq->seqno = request->global_seqno;
 	erq->jiffies = request->emitted_jiffies;
 	erq->head = request->head;
@@ -1357,9 +1357,9 @@  static void record_context(struct drm_i915_error_context *e,
 
 	e->handle = ctx->user_handle;
 	e->hw_id = ctx->hw_id;
-	e->ban_score = ctx->ban_score;
-	e->guilty = ctx->guilty_count;
-	e->active = ctx->active_count;
+	e->ban_score = atomic_read(&ctx->ban_score);
+	e->guilty = atomic_read(&ctx->guilty_count);
+	e->active = atomic_read(&ctx->active_count);
 }
 
 static void request_record_user_bo(struct drm_i915_gem_request *request,