diff mbox

drm/i915/perf: fix perf stream opening lock

Message ID 20180228114548.9121-1-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Lionel Landwerlin Feb. 28, 2018, 11:45 a.m. UTC
We're seeing on CI that some contexts don't have the programmed OA
period timer that directs the OA unit on how often to write reports.

The issue is that we're not holding the drm lock from when we edit the
context images down to when we set the exclusive_stream variable. This
leaves a window for the deferred context allocation to call
i915_oa_init_reg_state() that will not program the expected OA timer
value, because we haven't set the exclusive_stream yet.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 701f8231a2f ("drm/i915/perf: prune OA configs")
Cc: <stable@vger.kernel.org> # v4.14+
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102254
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103715
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103755
---
 drivers/gpu/drm/i915/i915_perf.c | 41 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

Comments

Matthew Auld Feb. 28, 2018, 6:10 p.m. UTC | #1
On 28 February 2018 at 11:45, Lionel Landwerlin
<lionel.g.landwerlin@intel.com> wrote:
> We're seeing on CI that some contexts don't have the programmed OA
> period timer that directs the OA unit on how often to write reports.
>
> The issue is that we're not holding the drm lock from when we edit the
> context images down to when we set the exclusive_stream variable. This
> leaves a window for the deferred context allocation to call
> i915_oa_init_reg_state() that will not program the expected OA timer
> value, because we haven't set the exclusive_stream yet.
>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Fixes: 701f8231a2f ("drm/i915/perf: prune OA configs")
> Cc: <stable@vger.kernel.org> # v4.14+
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102254
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103715
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103755
> ---
>  drivers/gpu/drm/i915/i915_perf.c | 41 +++++++++++++++++++---------------------
>  1 file changed, 19 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 2741b1bc7095..7016abfc8ba9 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1757,21 +1757,16 @@ static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_pr
>   */
>  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
>                                        const struct i915_oa_config *oa_config,
> -                                      bool interruptible)
> +                                      bool need_lock)
>  {
>         struct i915_gem_context *ctx;
>         int ret;
>         unsigned int wait_flags = I915_WAIT_LOCKED;
>
> -       if (interruptible) {
> -               ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> -               if (ret)
> -                       return ret;
> -
> -               wait_flags |= I915_WAIT_INTERRUPTIBLE;
> -       } else {
> +       if (need_lock)
>                 mutex_lock(&dev_priv->drm.struct_mutex);
> -       }
> +
> +       lockdep_assert_held(&dev_priv->drm.struct_mutex);
>
>         /* Switch away from any user context. */
>         ret = gen8_switch_to_updated_kernel_context(dev_priv, oa_config);
> @@ -1819,7 +1814,8 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
>         }
>
>   out:
> -       mutex_unlock(&dev_priv->drm.struct_mutex);
> +       if (need_lock)
> +               mutex_unlock(&dev_priv->drm.struct_mutex);
>
>         return ret;
>  }
> @@ -1863,7 +1859,7 @@ static int gen8_enable_metric_set(struct drm_i915_private *dev_priv,
>          * to make sure all slices/subslices are ON before writing to NOA
>          * registers.
>          */
> -       ret = gen8_configure_all_contexts(dev_priv, oa_config, true);
> +       ret = gen8_configure_all_contexts(dev_priv, oa_config, false);
>         if (ret)
>                 return ret;
>
> @@ -1878,7 +1874,7 @@ static int gen8_enable_metric_set(struct drm_i915_private *dev_priv,
>  static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
>  {
>         /* Reset all contexts' slices/subslices configurations. */
> -       gen8_configure_all_contexts(dev_priv, NULL, false);
> +       gen8_configure_all_contexts(dev_priv, NULL, true);

Maybe move the ops.disable_metric_set() to also be within the lock, so
need_lock=false, then we can get rid of need_lock?

>
>         I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
>                                       ~GT_NOA_ENABLE));
> @@ -1888,7 +1884,7 @@ static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
>  static void gen10_disable_metric_set(struct drm_i915_private *dev_priv)
>  {
>         /* Reset all contexts' slices/subslices configurations. */
> -       gen8_configure_all_contexts(dev_priv, NULL, false);
> +       gen8_configure_all_contexts(dev_priv, NULL, true);
>
>         /* Make sure we disable noa to save power. */
>         I915_WRITE(RPM_CONFIG1,
> @@ -2138,13 +2134,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>         if (ret)
>                 goto err_oa_buf_alloc;
>
> -       ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
> -                                                     stream->oa_config);
> -       if (ret)
> -               goto err_enable;
> -
> -       stream->ops = &i915_oa_stream_ops;
> -
>         /* Lock device for exclusive_stream access late because
>          * enable_metric_set() might lock as well on gen8+.
>          */

Ok, we can get rid of this comment now.

> @@ -2152,16 +2141,24 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>         if (ret)
>                 goto err_lock;
>
> +       ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
> +                                                     stream->oa_config);
> +       if (ret)
> +               goto err_enable;
> +
> +       stream->ops = &i915_oa_stream_ops;
> +
>         dev_priv->perf.oa.exclusive_stream = stream;
>
>         mutex_unlock(&dev_priv->drm.struct_mutex);
>
>         return 0;
>
> -err_lock:
> +err_enable:
> +       mutex_unlock(&dev_priv->drm.struct_mutex);
>         dev_priv->perf.oa.ops.disable_metric_set(dev_priv);

We can drop the disable_metric_set here; no need to disable it if we
never enabled it.

Nice catch,
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Chris Wilson March 1, 2018, 8:18 a.m. UTC | #2
Quoting Lionel Landwerlin (2018-02-28 11:45:48)
> We're seeing on CI that some contexts don't have the programmed OA
> period timer that directs the OA unit on how often to write reports.
> 
> The issue is that we're not holding the drm lock from when we edit the
> context images down to when we set the exclusive_stream variable. This
> leaves a window for the deferred context allocation to call
> i915_oa_init_reg_state() that will not program the expected OA timer
> value, because we haven't set the exclusive_stream yet.

Thank you for this quick explanation. After staring at the diff to
figure out what race you saw, the succinct explanation of the window
between altering all previous contexts and marking up new contexts,
enlightens.

> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
> Fixes: 701f8231a2f ("drm/i915/perf: prune OA configs")
> Cc: <stable@vger.kernel.org> # v4.14+
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102254
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103715
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103755

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
Lionel Landwerlin March 1, 2018, 10:30 a.m. UTC | #3
On 28/02/18 18:10, Matthew Auld wrote:
> On 28 February 2018 at 11:45, Lionel Landwerlin
> <lionel.g.landwerlin@intel.com> wrote:
>> We're seeing on CI that some contexts don't have the programmed OA
>> period timer that directs the OA unit on how often to write reports.
>>
>> The issue is that we're not holding the drm lock from when we edit the
>> context images down to when we set the exclusive_stream variable. This
>> leaves a window for the deferred context allocation to call
>> i915_oa_init_reg_state() that will not program the expected OA timer
>> value, because we haven't set the exclusive_stream yet.
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
>> Fixes: 701f8231a2f ("drm/i915/perf: prune OA configs")
>> Cc: <stable@vger.kernel.org> # v4.14+
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102254
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103715
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103755
>> ---
>>   drivers/gpu/drm/i915/i915_perf.c | 41 +++++++++++++++++++---------------------
>>   1 file changed, 19 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index 2741b1bc7095..7016abfc8ba9 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -1757,21 +1757,16 @@ static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_pr
>>    */
>>   static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
>>                                         const struct i915_oa_config *oa_config,
>> -                                      bool interruptible)
>> +                                      bool need_lock)
>>   {
>>          struct i915_gem_context *ctx;
>>          int ret;
>>          unsigned int wait_flags = I915_WAIT_LOCKED;
>>
>> -       if (interruptible) {
>> -               ret = i915_mutex_lock_interruptible(&dev_priv->drm);
>> -               if (ret)
>> -                       return ret;
>> -
>> -               wait_flags |= I915_WAIT_INTERRUPTIBLE;
>> -       } else {
>> +       if (need_lock)
>>                  mutex_lock(&dev_priv->drm.struct_mutex);
>> -       }
>> +
>> +       lockdep_assert_held(&dev_priv->drm.struct_mutex);
>>
>>          /* Switch away from any user context. */
>>          ret = gen8_switch_to_updated_kernel_context(dev_priv, oa_config);
>> @@ -1819,7 +1814,8 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
>>          }
>>
>>    out:
>> -       mutex_unlock(&dev_priv->drm.struct_mutex);
>> +       if (need_lock)
>> +               mutex_unlock(&dev_priv->drm.struct_mutex);
>>
>>          return ret;
>>   }
>> @@ -1863,7 +1859,7 @@ static int gen8_enable_metric_set(struct drm_i915_private *dev_priv,
>>           * to make sure all slices/subslices are ON before writing to NOA
>>           * registers.
>>           */
>> -       ret = gen8_configure_all_contexts(dev_priv, oa_config, true);
>> +       ret = gen8_configure_all_contexts(dev_priv, oa_config, false);
>>          if (ret)
>>                  return ret;
>>
>> @@ -1878,7 +1874,7 @@ static int gen8_enable_metric_set(struct drm_i915_private *dev_priv,
>>   static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
>>   {
>>          /* Reset all contexts' slices/subslices configurations. */
>> -       gen8_configure_all_contexts(dev_priv, NULL, false);
>> +       gen8_configure_all_contexts(dev_priv, NULL, true);
> Maybe move the ops.disable_metric_set() to also be within the lock, so
> need_lock=false, then we can get rid of need_lock?

Yeah, sounds like a nice cleanup.

>
>>          I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
>>                                        ~GT_NOA_ENABLE));
>> @@ -1888,7 +1884,7 @@ static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
>>   static void gen10_disable_metric_set(struct drm_i915_private *dev_priv)
>>   {
>>          /* Reset all contexts' slices/subslices configurations. */
>> -       gen8_configure_all_contexts(dev_priv, NULL, false);
>> +       gen8_configure_all_contexts(dev_priv, NULL, true);
>>
>>          /* Make sure we disable noa to save power. */
>>          I915_WRITE(RPM_CONFIG1,
>> @@ -2138,13 +2134,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>>          if (ret)
>>                  goto err_oa_buf_alloc;
>>
>> -       ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
>> -                                                     stream->oa_config);
>> -       if (ret)
>> -               goto err_enable;
>> -
>> -       stream->ops = &i915_oa_stream_ops;
>> -
>>          /* Lock device for exclusive_stream access late because
>>           * enable_metric_set() might lock as well on gen8+.
>>           */
> Ok, we can get rid of this comment now.

Thanks, removing.

>
>> @@ -2152,16 +2141,24 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>>          if (ret)
>>                  goto err_lock;
>>
>> +       ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
>> +                                                     stream->oa_config);
>> +       if (ret)
>> +               goto err_enable;
>> +
>> +       stream->ops = &i915_oa_stream_ops;
>> +
>>          dev_priv->perf.oa.exclusive_stream = stream;
>>
>>          mutex_unlock(&dev_priv->drm.struct_mutex);
>>
>>          return 0;
>>
>> -err_lock:
>> +err_enable:
>> +       mutex_unlock(&dev_priv->drm.struct_mutex);
>>          dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
> We can drop the disable_metric_set here; no need to disable it if we
> never enabled it.

Well in this case we did, we edited all the context images.
Better cleanup out changes.

>
> Nice catch,
> Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 2741b1bc7095..7016abfc8ba9 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1757,21 +1757,16 @@  static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_pr
  */
 static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 				       const struct i915_oa_config *oa_config,
-				       bool interruptible)
+				       bool need_lock)
 {
 	struct i915_gem_context *ctx;
 	int ret;
 	unsigned int wait_flags = I915_WAIT_LOCKED;
 
-	if (interruptible) {
-		ret = i915_mutex_lock_interruptible(&dev_priv->drm);
-		if (ret)
-			return ret;
-
-		wait_flags |= I915_WAIT_INTERRUPTIBLE;
-	} else {
+	if (need_lock)
 		mutex_lock(&dev_priv->drm.struct_mutex);
-	}
+
+	lockdep_assert_held(&dev_priv->drm.struct_mutex);
 
 	/* Switch away from any user context. */
 	ret = gen8_switch_to_updated_kernel_context(dev_priv, oa_config);
@@ -1819,7 +1814,8 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	}
 
  out:
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+	if (need_lock)
+		mutex_unlock(&dev_priv->drm.struct_mutex);
 
 	return ret;
 }
@@ -1863,7 +1859,7 @@  static int gen8_enable_metric_set(struct drm_i915_private *dev_priv,
 	 * to make sure all slices/subslices are ON before writing to NOA
 	 * registers.
 	 */
-	ret = gen8_configure_all_contexts(dev_priv, oa_config, true);
+	ret = gen8_configure_all_contexts(dev_priv, oa_config, false);
 	if (ret)
 		return ret;
 
@@ -1878,7 +1874,7 @@  static int gen8_enable_metric_set(struct drm_i915_private *dev_priv,
 static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
 {
 	/* Reset all contexts' slices/subslices configurations. */
-	gen8_configure_all_contexts(dev_priv, NULL, false);
+	gen8_configure_all_contexts(dev_priv, NULL, true);
 
 	I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
 				      ~GT_NOA_ENABLE));
@@ -1888,7 +1884,7 @@  static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
 static void gen10_disable_metric_set(struct drm_i915_private *dev_priv)
 {
 	/* Reset all contexts' slices/subslices configurations. */
-	gen8_configure_all_contexts(dev_priv, NULL, false);
+	gen8_configure_all_contexts(dev_priv, NULL, true);
 
 	/* Make sure we disable noa to save power. */
 	I915_WRITE(RPM_CONFIG1,
@@ -2138,13 +2134,6 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	if (ret)
 		goto err_oa_buf_alloc;
 
-	ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
-						      stream->oa_config);
-	if (ret)
-		goto err_enable;
-
-	stream->ops = &i915_oa_stream_ops;
-
 	/* Lock device for exclusive_stream access late because
 	 * enable_metric_set() might lock as well on gen8+.
 	 */
@@ -2152,16 +2141,24 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	if (ret)
 		goto err_lock;
 
+	ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
+						      stream->oa_config);
+	if (ret)
+		goto err_enable;
+
+	stream->ops = &i915_oa_stream_ops;
+
 	dev_priv->perf.oa.exclusive_stream = stream;
 
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
 	return 0;
 
-err_lock:
+err_enable:
+	mutex_unlock(&dev_priv->drm.struct_mutex);
 	dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
 
-err_enable:
+err_lock:
 	free_oa_buffer(dev_priv);
 
 err_oa_buf_alloc: