
[v4,6/8] drm/i915: reprogram NOA muxes on context switch when using perf

Message ID 20180509174851.13847-7-lionel.g.landwerlin@intel.com (mailing list archive)
State New, archived

Commit Message

Lionel Landwerlin May 9, 2018, 5:48 p.m. UTC
If some of the contexts submitting workloads to the GPU have been
configured to shut down slices/subslices, we might lose the NOA
configurations written into the NOA muxes. We need to reprogram them
when we detect a powergating configuration change.

In this change, i915/perf is responsible for setting up a reprogramming
batchbuffer which we execute just before the userspace-submitted
batchbuffer. We do this only when needed, while preemption is still
disabled. The decision to execute this reprogramming batchbuffer is made
when we assign a request to an execlist port.
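
In rough terms the mechanism is the following (condensed from the patch
below, with error handling and the non-RCS cases omitted; all names are
taken from the patch):

/* In gen8_emit_bb_start(), on the RCS only: reserve placeholder MI_NOOPs
 * and remember where they live in the ring.
 */
rq->perf_prog = intel_ring_offset(rq, cs);
*cs++ = MI_NOOP;
*cs++ = MI_NOOP;
*cs++ = MI_NOOP;
*cs++ = MI_NOOP; /* Aligning to 2 dwords */

/* In port_assign(), just before HW submission: turn the NOOPs into a
 * jump to the NOA reprogramming batch, but only when the powergating
 * (SSEU) configuration actually changed.
 */
if (engine->noa_reprogram_vma &&
    engine->last_sseu.value != rq->sseu.value) {
	u32 *cs = rq->ring->vaddr + rq->perf_prog;

	*cs++ = MI_BATCH_BUFFER_START_GEN8;
	*cs++ = lower_32_bits(engine->noa_reprogram_vma->node.start);
	*cs++ = upper_32_bits(engine->noa_reprogram_vma->node.start);

	engine->last_sseu = rq->sseu;
}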

v2: Only reprogram when detecting configuration changes (Chris/Lionel)

v3: Clear engine sseu tracking on execlists cancel port (Chris)
    Store NOA reprogramming vma on the engine (Chris/Lionel)
    Use PIPE_CONTROL MMIO write correctly, on the last register write (Chris/Lionel)
    Pin NOA reprogramming vma with PIN_USER only (Chris)
    Program MI_BATCH_BUFFER_START into NOA reprogramming correctly (Chris)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
---
 drivers/gpu/drm/i915/i915_perf.c        | 135 ++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_request.c     |   2 +
 drivers/gpu/drm/i915/i915_request.h     |  11 ++
 drivers/gpu/drm/i915/intel_engine_cs.c  |   2 +
 drivers/gpu/drm/i915/intel_lrc.c        |  57 +++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.h |  14 +++
 6 files changed, 220 insertions(+), 1 deletion(-)

Comments

Lionel Landwerlin May 9, 2018, 11:38 p.m. UTC | #1
On 09/05/18 18:48, Lionel Landwerlin wrote:
> @@ -1953,10 +1992,26 @@ static int gen8_emit_bb_start(struct i915_request *rq,
>   		rq->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
>   	}
>   
> -	cs = intel_ring_begin(rq, 6);
> +	cs = intel_ring_begin(rq, rq->engine->id == RCS ? 10 : 6);
>   	if (IS_ERR(cs))
>   		return PTR_ERR(cs);
>   
> +	if (rq->engine->id == RCS) {
> +		/*
> +		 * Leave some instructions to be written with an
> +		 * MI_BATCH_BUFFER_START to the i915/perf NOA reprogramming
> +		 * batchbuffer. We only turn those MI_NOOP into
> +		 * MI_BATCH_BUFFER_START when we detect a SSEU powergating
> +		 * configuration change that might affect NOA. This is only
> +		 * for the RCS.
> +		 */
> +		rq->perf_prog = intel_ring_offset(rq, cs);
> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP; /* Aligning to 2 dwords */
> +	}
> +
I just realized this isn't going to work if a request is preempted and
then later resubmitted after another context with a different
powergating config...
The reprogramming batchbuffer won't be executed because the CS pointer
will already be past that point by then.
That seems to make this approach unworkable?

Would a per-ctx-wa-bb call into the reprogramming buffer under
MI_PREDICATE work? Something along these lines (rough C sketch below):

LOAD rpcs into predicate_reg0
LOAD engine storage for last rpcs into predicate_reg1
PREDICATE reg0 == reg1
MI_LRI noa registers
PREDICATE unset
STORE rpcs into engine storage for last rpcs
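
For illustration, the above could look roughly like this when building
such a buffer. This is only a sketch of the idea:
emit_predicated_noa_reprogram() and last_rpcs_addr are hypothetical
names, the exact MI_PREDICATE dword encoding is omitted, and whether
MI_LRI actually honours the predicate on Gen8 still needs checking
against the bspec.

static u32 *
emit_predicated_noa_reprogram(u32 *cs, u64 last_rpcs_addr,
			      const struct i915_oa_config *oa_config)
{
	int i;

	/* predicate_reg0 <- current RPCS value */
	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
	*cs++ = i915_mmio_reg_offset(MI_PREDICATE_SRC0);

	/* predicate_reg1 <- last RPCS value stored for this engine */
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8;
	*cs++ = i915_mmio_reg_offset(MI_PREDICATE_SRC1);
	*cs++ = lower_32_bits(last_rpcs_addr);
	*cs++ = upper_32_bits(last_rpcs_addr);

	/*
	 * MI_PREDICATE comparing SRC0/SRC1 would go here, set up so the
	 * following LRIs are skipped when the two values are equal
	 * (encoding omitted).
	 */

	/* Reprogram the NOA muxes (MAX_LRI_SIZE splitting omitted). */
	*cs++ = MI_LOAD_REGISTER_IMM(oa_config->mux_regs_len);
	for (i = 0; i < oa_config->mux_regs_len; i++) {
		*cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
		*cs++ = oa_config->mux_regs[i].value;
	}

	/* MI_PREDICATE disable would go here (encoding omitted). */

	/* Remember the RPCS value this submission ran with. */
	*cs++ = MI_STORE_REGISTER_MEM_GEN8;
	*cs++ = i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
	*cs++ = lower_32_bits(last_rpcs_addr);
	*cs++ = upper_32_bits(last_rpcs_addr);

	return cs;
}

A per-ctx WA-BB could then MI_BATCH_BUFFER_START into a buffer built
like this, so the check would run on every context restore rather than
only when a port is assigned.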

Thanks,

-
Lionel

Patch

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5b279a82445a..66a8f296290a 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1691,6 +1691,122 @@  static int gen8_emit_oa_config(struct i915_request *rq,
 	return 0;
 }
 
+#define MAX_LRI_SIZE (125U)
+
+static u32 noa_reprogram_bb_size(struct drm_i915_private *dev_priv,
+				 const struct i915_oa_config *oa_config)
+{
+	u32 n_lri_mux_regs;
+	u32 n_lri;
+
+	/* Very unlikely but possible that we have no muxes to configure. */
+	if (!oa_config->mux_regs_len)
+		return 0;
+
+	n_lri_mux_regs = oa_config->mux_regs_len - 1;
+
+	n_lri = (n_lri_mux_regs / MAX_LRI_SIZE) +
+		((n_lri_mux_regs % MAX_LRI_SIZE) != 0);
+
+	return n_lri * 4 + n_lri_mux_regs * 8 + /* MI_LOAD_REGISTER_IMMs */
+		gen8_lri_pipe_control_len(dev_priv) + /* PIPE_CONTROL */
+		4; /* MI_BATCH_BUFFER_END */
+}
+
+static struct i915_vma *
+alloc_noa_reprogram_bo(struct drm_i915_private *dev_priv,
+		       const struct i915_oa_config *oa_config)
+{
+	struct drm_i915_gem_object *bo;
+	struct i915_vma *vma;
+	u32 buffer_size, pc_flags;
+	u32 *cs;
+	int i, ret, last_reg, n_loaded_regs;
+
+	buffer_size =
+		ALIGN(noa_reprogram_bb_size(dev_priv, oa_config), PAGE_SIZE);
+	if (buffer_size == 0)
+		return NULL;
+
+	bo = i915_gem_object_create(dev_priv, buffer_size);
+	if (IS_ERR(bo)) {
+		DRM_ERROR("Failed to allocate NOA reprogramming buffer\n");
+		return ERR_CAST(bo);
+	}
+
+	cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
+	if (IS_ERR(cs)) {
+		ret = PTR_ERR(cs);
+		goto err_unref_bo;
+	}
+
+	n_loaded_regs = 0;
+	last_reg = oa_config->mux_regs_len - 1;
+	for (i = 0; i < last_reg; i++) {
+		if ((n_loaded_regs % MAX_LRI_SIZE) == 0) {
+			u32 n_lri = min_t(u32, last_reg - n_loaded_regs,
+					  MAX_LRI_SIZE);
+			*cs++ = MI_LOAD_REGISTER_IMM(n_lri);
+		}
+
+		*cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
+		*cs++ = oa_config->mux_regs[i].value;
+		n_loaded_regs++;
+	}
+
+	pc_flags = PIPE_CONTROL_CS_STALL;
+	/*
+	 * Project: PRE-SKL
+	 *
+	 *  Command Streamer Stall Enable:
+	 *
+	 *  "One of the following must also be set:
+	 *     - Render Target Cache Flush Enable
+	 *     - Depth Cache Flush Enable
+	 *     - Stall at Pixel Scoreboard
+	 *     - Depth Stall
+	 *     - Post-Sync Operation
+	 *     - DC Flush Enable"
+	 *
+	 *  Since we only do NOA reprogramming on Gen8+, Gen8 is the only
+	 *  platform where this workaround needs to be applied.
+	 */
+	if (IS_GEN8(dev_priv))
+		pc_flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+
+	/* Serialize on the last MMIO write. */
+	cs = gen8_emit_lri_pipe_control(dev_priv, cs, pc_flags,
+					i915_mmio_reg_offset(oa_config->mux_regs[last_reg].addr),
+					oa_config->mux_regs[last_reg].value);
+
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	i915_gem_object_unpin_map(bo);
+
+	ret = i915_gem_object_set_to_gtt_domain(bo, false);
+	if (ret)
+		goto err_unref_bo;
+
+	vma = i915_vma_instance(bo, &dev_priv->ggtt.base, NULL);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto err_unref_bo;
+	}
+
+	ret = i915_vma_pin(vma, 0, 0, PIN_USER);
+	if (ret)
+		goto err_unref_bo;
+
+	return vma;
+
+err_unref_bo:
+	i915_gem_object_put(bo);
+
+	return ERR_PTR(ret);
+}
+
 static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv,
 						 const struct i915_oa_config *oa_config)
 {
@@ -1784,6 +1900,25 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	if (ret)
 		return ret;
 
+	/*
+	 * Powergating configuration changes will lose some of the NOA
+	 * programming. Set a NOA reprogramming BO for the engine to execute
+	 * when a powergating configuration change is detected.
+	 */
+	if (oa_config) {
+		struct i915_vma *reprog_vma =
+			alloc_noa_reprogram_bo(dev_priv, oa_config);
+
+		if (IS_ERR(reprog_vma)) {
+			DRM_DEBUG("Unable to alloc NOA reprogramming BO\n");
+			return PTR_ERR(reprog_vma);
+		}
+		engine->noa_reprogram_vma = reprog_vma;
+	} else {
+		i915_vma_unpin_and_release(&engine->noa_reprogram_vma);
+		engine->noa_reprogram_vma = NULL;
+	}
+
 	/* Update all contexts now that we've stalled the submission. */
 	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
 		struct intel_context *ce = to_intel_context(ctx, engine);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 8928894dd9c7..dd0b37e0a85c 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -786,6 +786,8 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	rq->capture_list = NULL;
 	rq->waitboost = false;
 
+	rq->sseu = ctx->__engine[engine->id].sseu;
+
 	/*
 	 * Reserve space in the ring buffer for all the commands required to
 	 * eventually emit this request. This is to guarantee that the
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index beb312ac9aa0..b4191d382145 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -162,6 +162,17 @@  struct i915_request {
 	/** Preallocate space in the ring for the emitting the request */
 	u32 reserved_space;
 
+	/*
+	 * Position in the ringbuffer where the i915/perf NOA reprogramming
+	 * batchbuffer start can be inserted just before HW submission.
+	 */
+	u32 perf_prog;
+
+	/*
+	 * Powergating configuration associated with this request.
+	 */
+	union intel_sseu sseu;
+
 	/** Batch buffer related to this request if any (used for
 	 * error state dump only).
 	 */
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 70325e0824e3..1bab0447c9dc 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -504,6 +504,8 @@  void intel_engine_setup_common(struct intel_engine_cs *engine)
 {
 	i915_timeline_init(engine->i915, &engine->timeline, engine->name);
 
+	memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));
+
 	intel_engine_init_execlist(engine);
 	intel_engine_init_hangcheck(engine);
 	intel_engine_init_batch_pool(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index a608ff0f9e7a..c9a51185b7fe 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -513,6 +513,37 @@  static bool can_merge_ctx(const struct i915_gem_context *prev,
 	return true;
 }
 
+static void maybe_enable_noa_reprogram(struct i915_request *rq)
+{
+	struct intel_engine_cs *engine = rq->engine;
+	u32 *cs;
+
+	/* Slice/subslice/EU powergating only matters on the RCS. */
+	if (engine->id != RCS)
+		return;
+
+	/*
+	 * If the i915 perf stream is not enabled or it doesn't source any
+	 * data from the NOA muxes, we won't have anything to reconfigure.
+	 */
+	if (!engine->noa_reprogram_vma)
+		return;
+
+	/*
+	 * If the powergating configuration doesn't change, no need to
+	 * reprogram.
+	 */
+	if (engine->last_sseu.value == rq->sseu.value)
+		return;
+
+	cs = rq->ring->vaddr + rq->perf_prog;
+	*cs++ = MI_BATCH_BUFFER_START_GEN8;
+	*cs++ = lower_32_bits(engine->noa_reprogram_vma->node.start);
+	*cs++ = upper_32_bits(engine->noa_reprogram_vma->node.start);
+
+	engine->last_sseu = rq->sseu;
+}
+
 static void port_assign(struct execlist_port *port, struct i915_request *rq)
 {
 	GEM_BUG_ON(rq == port_request(port));
@@ -520,6 +551,8 @@  static void port_assign(struct execlist_port *port, struct i915_request *rq)
 	if (port_isset(port))
 		i915_request_put(port_request(port));
 
+	maybe_enable_noa_reprogram(rq);
+
 	port_set(port, port_pack(i915_request_get(rq), port_count(port)));
 }
 
@@ -801,6 +834,12 @@  execlists_cancel_port_requests(struct intel_engine_cs *engine)
 	}
 
 	execlists_user_end(execlists);
+
+	/*
+	 * Clear out the state of the sseu on the engine, as it's not clear
+	 * what it will be after preemption.
+	 */
+	engine->last_sseu.value = 0;
 }
 
 static void clear_gtiir(struct intel_engine_cs *engine)
@@ -1953,10 +1992,26 @@  static int gen8_emit_bb_start(struct i915_request *rq,
 		rq->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
 	}
 
-	cs = intel_ring_begin(rq, 6);
+	cs = intel_ring_begin(rq, rq->engine->id == RCS ? 10 : 6);
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
+	if (rq->engine->id == RCS) {
+		/*
+		 * Leave some instructions to be written with an
+		 * MI_BATCH_BUFFER_START to the i915/perf NOA reprogramming
+		 * batchbuffer. We only turn those MI_NOOP into
+		 * MI_BATCH_BUFFER_START when we detect a SSEU powergating
+		 * configuration change that might affect NOA. This is only
+		 * for the RCS.
+		 */
+		rq->perf_prog = intel_ring_offset(rq, cs);
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP; /* Aligning to 2 dwords */
+	}
+
 	/*
 	 * WaDisableCtxRestoreArbitration:bdw,chv
 	 *
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 1d00cc3cc1a4..955518a5396f 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -343,6 +343,20 @@  struct intel_engine_cs {
 
 	struct drm_i915_gem_object *default_state;
 
+	/**
+	 * @noa_reprogram_vma: A batchbuffer reprogramming the NOA muxes, used
+	 * after switching powergating configurations. This field is only
+	 * assigned by i915/perf after calling i915_gem_wait_for_idle() and
+	 * while holding the device's lock.
+	 */
+	struct i915_vma *noa_reprogram_vma;
+
+	/**
+	 * @last_sseu: The last SSEU configuration submitted to the
+	 * hardware. Set to 0 if unknown.
+	 */
+	union intel_sseu last_sseu;
+
 	atomic_t irq_count;
 	unsigned long irq_posted;
 #define ENGINE_IRQ_BREADCRUMB 0