diff mbox series

[v11,09/10] drm/i915/perf: execute OA configuration from command stream

Message ID 20190828143327.7965-10-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Vulkan performance query support | expand

Commit Message

Lionel Landwerlin Aug. 28, 2019, 2:33 p.m. UTC
We haven't run into issues with programming the global OA/NOA
registers configuration from CPU so far, but HW engineers actually
recommend doing this from the command streamer. On TGL in particular
one of the clock domains in which some of that programming goes might
not be powered when we poke things from the CPU.

Since we have a command buffer prepared for the execbuffer side of
things, we can reuse that approach here too.

This also allows us to significantly reduce the amount of time we hold
the main lock.

v2: Drop the global lock as much as possible

v3: Take the global lock (struct_mutex) when pinning into the global GTT

v4: Create i915 request in emit_oa_config() to avoid deadlocks (Lionel)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |  12 +++
 drivers/gpu/drm/i915/i915_perf.c | 146 +++++++++++++++++++------------
 2 files changed, 104 insertions(+), 54 deletions(-)

Comments

Chris Wilson Aug. 28, 2019, 7:41 p.m. UTC | #1
Quoting Lionel Landwerlin (2019-08-28 15:33:26)
> +       rq = i915_request_create(i915->engine[RCS0]->kernel_context);
> +       if (IS_ERR(rq))
> +               return PTR_ERR(rq);
> +
> +       err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
> +                                     rq);

I am still not sold on putting the global oa_config timeline on the
engine, and still dislike the suggestion of using struct_mutex to
manage it.

You will notice that i915_active_request_set() checks that you hold the
mutex that you initialised it with (in this case that is a *NULL).
-Chris
Lionel Landwerlin Aug. 29, 2019, 6:45 a.m. UTC | #2
On 28/08/2019 22:41, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2019-08-28 15:33:26)
>> +       rq = i915_request_create(i915->engine[RCS0]->kernel_context);
>> +       if (IS_ERR(rq))
>> +               return PTR_ERR(rq);
>> +
>> +       err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
>> +                                     rq);
> I am still not sold on putting the global oa_config timeline on the
> engine, and still dislike the suggestion of using struct_mutex to
> manage it.


Where else could we put it?


>
> You will notice that i915_active_request_set() checks that you hold the
> mutex that you initialised it with (in this case that is a *NULL).
> -Chris
>

Will fix that, thanks.


-Lionel
Lionel Landwerlin Aug. 29, 2019, 6:58 a.m. UTC | #3
On 29/08/2019 09:45, Lionel Landwerlin wrote:
> On 28/08/2019 22:41, Chris Wilson wrote:
>> Quoting Lionel Landwerlin (2019-08-28 15:33:26)
>>> +       rq = i915_request_create(i915->engine[RCS0]->kernel_context);
>>> +       if (IS_ERR(rq))
>>> +               return PTR_ERR(rq);
>>> +
>>> +       err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
>>> +                                     rq);
>> I am still not sold on putting the global oa_config timeline on the
>> engine, and still dislike the suggestion of using struct_mutex to
>> manage it.
>
>
> Where else could we put it?


Note that I should probably move the exclusive_stream onto the engine too.

The idea being that this should be attached to the engine where part of 
the OA unit is located.


-Lionel

>
>
>>
>> You will notice that i915_active_request_set() checks that you hold the
>> mutex that you initialised it with (in this case that is a *NULL).
>> -Chris
>>
>
> Will fix that, thanks.
>
>
> -Lionel
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ec609c022e5e..f94de001201d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1094,6 +1094,18 @@  struct i915_perf_stream {
 	 */
 	intel_wakeref_t wakeref;
 
+	/**
+	 * @initial_config_rq: First request run at the opening of the i915
+	 * perf stream to configure the HW. Should be NULL after the perf
+	 * stream has been opened successfully.
+	 */
+	struct i915_request *initial_config_rq;
+
+	/**
+	 * @initial_oa_config_bo: First OA configuration BO to be run.
+	 */
+	struct drm_i915_gem_object *initial_oa_config_bo;
+
 	/**
 	 * @sample_flags: Flags representing the `DRM_I915_PERF_PROP_SAMPLE_*`
 	 * properties given when opening a stream, representing the contents
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 1eeabf0bbafc..29eba8c3c792 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1783,6 +1783,10 @@  static int alloc_noa_wait(struct i915_perf_stream *stream)
 		return PTR_ERR(bo);
 	}
 
+	ret = i915_mutex_lock_interruptible(&i915->drm);
+	if (ret)
+		goto err_unref;
+
 	/*
 	 * We pin in GGTT because we jump into this buffer now because
 	 * multiple OA config BOs will have a jump to this address and it
@@ -1790,10 +1794,13 @@  static int alloc_noa_wait(struct i915_perf_stream *stream)
 	 */
 	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
 	if (IS_ERR(vma)) {
+		mutex_unlock(&i915->drm.struct_mutex);
 		ret = PTR_ERR(vma);
 		goto err_unref;
 	}
 
+	mutex_unlock(&i915->drm.struct_mutex);
+
 	batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
 	if (IS_ERR(batch)) {
 		ret = PTR_ERR(batch);
@@ -1927,7 +1934,9 @@  static int alloc_noa_wait(struct i915_perf_stream *stream)
 	return 0;
 
 err_unpin:
-	__i915_vma_unpin(vma);
+	mutex_lock(&i915->drm.struct_mutex);
+	i915_vma_unpin_and_release(&vma, 0);
+	mutex_unlock(&i915->drm.struct_mutex);
 
 err_unref:
 	i915_gem_object_put(bo);
@@ -1935,50 +1944,71 @@  static int alloc_noa_wait(struct i915_perf_stream *stream)
 	return ret;
 }
 
-static void config_oa_regs(struct drm_i915_private *dev_priv,
-			   const struct i915_oa_reg *regs,
-			   u32 n_regs)
+static int emit_oa_config(struct drm_i915_private *i915,
+			  struct i915_perf_stream *stream)
 {
-	u32 i;
+	struct i915_request *rq;
+	struct i915_vma *vma;
+	u32 *cs;
+	int err;
 
-	for (i = 0; i < n_regs; i++) {
-		const struct i915_oa_reg *reg = regs + i;
+	lockdep_assert_held(&i915->drm.struct_mutex);
 
-		I915_WRITE(reg->addr, reg->value);
+	rq = i915_request_create(i915->engine[RCS0]->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	err = i915_active_request_set(&i915->engine[RCS0]->last_oa_config,
+				      rq);
+	if (err)
+		goto err_add_request;
+
+	vma = i915_vma_instance(stream->initial_oa_config_bo,
+				&i915->ggtt.vm, NULL);
+	if (unlikely(IS_ERR(vma))) {
+		err = PTR_ERR(vma);
+		goto err_add_request;
 	}
-}
 
-static void delay_after_mux(void)
-{
-	/*
-	 * It apparently takes a fairly long time for a new MUX
-	 * configuration to be be applied after these register writes.
-	 * This delay duration was derived empirically based on the
-	 * render_basic config but hopefully it covers the maximum
-	 * configuration latency.
-	 *
-	 * As a fallback, the checks in _append_oa_reports() to skip
-	 * invalid OA reports do also seem to work to discard reports
-	 * generated before this config has completed - albeit not
-	 * silently.
-	 *
-	 * Unfortunately this is essentially a magic number, since we
-	 * don't currently know of a reliable mechanism for predicting
-	 * how long the MUX config will take to apply and besides
-	 * seeing invalid reports we don't know of a reliable way to
-	 * explicitly check that the MUX config has landed.
-	 *
-	 * It's even possible we've miss characterized the underlying
-	 * problem - it just seems like the simplest explanation why
-	 * a delay at this location would mitigate any invalid reports.
-	 */
-	usleep_range(15000, 20000);
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+	if (err)
+		goto err_add_request;
+
+	err = i915_vma_move_to_active(vma, rq, 0);
+	if (err)
+		goto err_vma_unpin;
+
+	cs = intel_ring_begin(rq, INTEL_GEN(i915) >= 8 ? 4 : 2);
+	if (IS_ERR(cs)) {
+		err = PTR_ERR(cs);
+		goto err_vma_unpin;
+	}
+
+	if (INTEL_GEN(i915) > 8) {
+		*cs++ = MI_BATCH_BUFFER_START_GEN8;
+		*cs++ = lower_32_bits(vma->node.start);
+		*cs++ = upper_32_bits(vma->node.start);
+		*cs++ = MI_NOOP;
+	} else {
+		*cs++ = MI_BATCH_BUFFER_START;
+		*cs++ = vma->node.start;
+	}
+
+	intel_ring_advance(rq, cs);
+
+	stream->initial_config_rq = i915_request_get(rq);
+
+err_vma_unpin:
+	i915_vma_unpin(vma);
+err_add_request:
+	i915_request_add(rq);
+
+	return err;
 }
 
 static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
-	const struct i915_oa_config *oa_config = stream->oa_config;
 
 	/*
 	 * PRM:
@@ -1995,13 +2025,7 @@  static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 	I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
 				  GEN6_CSUNIT_CLOCK_GATE_DISABLE));
 
-	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-	delay_after_mux();
-
-	config_oa_regs(dev_priv, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return emit_oa_config(dev_priv, stream);
 }
 
 static void hsw_disable_metric_set(struct i915_perf_stream *stream)
@@ -2360,13 +2384,7 @@  static int gen8_enable_metric_set(struct i915_perf_stream *stream)
 	if (ret)
 		return ret;
 
-	config_oa_regs(dev_priv, oa_config->mux_regs, oa_config->mux_regs_len);
-	delay_after_mux();
-
-	config_oa_regs(dev_priv, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-
-	return 0;
+	return emit_oa_config(dev_priv, stream);
 }
 
 static void gen8_disable_metric_set(struct i915_perf_stream *stream)
@@ -2542,6 +2560,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int format_size;
+	long timeout;
 	int ret;
 
 	/* If the sysfs metrics/ directory wasn't registered for some
@@ -2611,8 +2630,9 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 		goto err_noa_wait_alloc;
 	}
 
-	ret = i915_perf_get_oa_config(dev_priv, props->metrics_set,
-				      &stream->oa_config);
+	ret = i915_perf_get_oa_config_and_bo(stream, props->metrics_set,
+					     &stream->oa_config,
+					     &stream->initial_oa_config_bo);
 	if (ret) {
 		DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
 		goto err_config;
@@ -2637,22 +2657,34 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	if (ret)
 		goto err_oa_buf_alloc;
 
+	stream->ops = &i915_oa_stream_ops;
+
 	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
 	if (ret)
 		goto err_lock;
 
-	stream->ops = &i915_oa_stream_ops;
 	dev_priv->perf.exclusive_stream = stream;
 
 	ret = dev_priv->perf.ops.enable_metric_set(stream);
+	mutex_unlock(&dev_priv->drm.struct_mutex);
 	if (ret) {
 		DRM_DEBUG("Unable to enable metric set\n");
 		goto err_enable;
 	}
 
-	DRM_DEBUG("opening stream oa config uuid=%s\n", stream->oa_config->uuid);
+	timeout = i915_request_wait(stream->initial_config_rq,
+				    I915_WAIT_INTERRUPTIBLE,
+				    MAX_SCHEDULE_TIMEOUT);
+	i915_request_put(stream->initial_config_rq);
+	i915_gem_object_put(stream->initial_oa_config_bo);
+	stream->initial_config_rq = NULL;
+	stream->initial_oa_config_bo = NULL;
 
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+	ret = timeout < 0 ? timeout : 0;
+	if (ret)
+		goto err_enable;
+
+	DRM_DEBUG("opening stream oa config uuid=%s\n", stream->oa_config->uuid);
 
 	hrtimer_init(&stream->poll_check_timer,
 		     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -2663,6 +2695,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	return 0;
 
 err_enable:
+	mutex_lock(&dev_priv->drm.struct_mutex);
 	dev_priv->perf.exclusive_stream = NULL;
 	dev_priv->perf.ops.disable_metric_set(stream);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
@@ -2674,6 +2707,11 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
 	intel_runtime_pm_put(&dev_priv->runtime_pm, stream->wakeref);
 
+	free_oa_configs(stream);
+
+	i915_gem_object_put(stream->initial_oa_config_bo);
+	i915_request_put(stream->initial_config_rq);
+
 err_config:
 	free_noa_wait(stream);