
[v2] drm/i915/oa: Reconfigure contexts on the fly

Message ID 20190705131642.9246-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [v2] drm/i915/oa: Reconfigure contexts on the fly

Commit Message

Chris Wilson July 5, 2019, 1:16 p.m. UTC
Avoid a global idle barrier by reconfiguring each context in place,
rewriting its register state with MI_STORE_DWORD from the kernel context.
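
For illustration, each register update is emitted from the kernel context
as a 4-dword MI_STORE_DWORD_IMM into the target context image; roughly (a
sketch of the gen8_store_flex() helper added below, writes land in the LRC
state page of the remote context):

    *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
    *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE +
            (flex->offset + 1) * sizeof(u32); /* value slot for this register */
    *cs++ = 0; /* upper 32 bits of the GGTT address */
    *cs++ = flex->value;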

v2: We only need to determine the desired register values once; they are
the same for all contexts.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c |   2 +
 drivers/gpu/drm/i915/gt/intel_lrc.c         |   7 +-
 drivers/gpu/drm/i915/i915_perf.c            | 244 +++++++++++++++-----
 3 files changed, 190 insertions(+), 63 deletions(-)

Comments

Lionel Landwerlin July 5, 2019, 1:29 p.m. UTC | #1
Looks good; probably best to have someone more familiar with the i915
codebase look at it too.

Thanks a bunch!

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>


Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index e367dce2a696..1f0d10bb88c1 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -624,7 +624,9 @@  i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
 	ctx->sched.priority = I915_USER_PRIORITY(prio);
 	ctx->ring_size = PAGE_SIZE;
 
+	/* Isolate the kernel context from prying eyes and sticky fingers */
 	GEM_BUG_ON(!i915_gem_context_is_kernel(ctx));
+	list_del_init(&ctx->link);
 
 	return ctx;
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index e1ae1399c72b..9cc5374401e1 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1552,9 +1552,12 @@  __execlists_update_reg_state(struct intel_context *ce,
 	regs[CTX_RING_TAIL + 1] = ring->tail;
 
 	/* RPCS */
-	if (engine->class == RENDER_CLASS)
+	if (engine->class == RENDER_CLASS) {
 		regs[CTX_R_PWR_CLK_STATE + 1] =
 			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
+
+		i915_oa_init_reg_state(engine, ce, regs);
+	}
 }
 
 static int
@@ -2966,8 +2969,6 @@  static void execlists_init_reg_state(u32 *regs,
 	if (rcs) {
 		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
 		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
-
-		i915_oa_init_reg_state(engine, ce, regs);
 	}
 
 	regs[CTX_END] = MI_BATCH_BUFFER_END;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 357e63beb373..8353589ee31b 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1630,6 +1630,27 @@  static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
 				      ~GT_NOA_ENABLE));
 }
 
+static u32 oa_config_flex_reg(const struct i915_oa_config *oa_config,
+			      i915_reg_t reg)
+{
+	u32 mmio = i915_mmio_reg_offset(reg);
+	int i;
+
+	/*
+	 * This arbitrary default will select the 'EU FPU0 Pipeline
+	 * Active' event. In the future it's anticipated that there
+	 * will be an explicit 'No Event' we can select, but not yet...
+	 */
+	if (!oa_config)
+		return 0;
+
+	for (i = 0; i < oa_config->flex_regs_len; i++) {
+		if (i915_mmio_reg_offset(oa_config->flex_regs[i].addr) == mmio)
+			return oa_config->flex_regs[i].value;
+	}
+
+	return 0;
+}
 /*
  * NB: It must always remain pointer safe to run this even if the OA unit
  * has been disabled.
@@ -1663,28 +1684,8 @@  gen8_update_reg_state_unlocked(struct intel_context *ce,
 		GEN8_OA_COUNTER_RESUME);
 
 	for (i = 0; i < ARRAY_SIZE(flex_regs); i++) {
-		u32 state_offset = ctx_flexeu0 + i * 2;
-		u32 mmio = i915_mmio_reg_offset(flex_regs[i]);
-
-		/*
-		 * This arbitrary default will select the 'EU FPU0 Pipeline
-		 * Active' event. In the future it's anticipated that there
-		 * will be an explicit 'No Event' we can select, but not yet...
-		 */
-		u32 value = 0;
-
-		if (oa_config) {
-			u32 j;
-
-			for (j = 0; j < oa_config->flex_regs_len; j++) {
-				if (i915_mmio_reg_offset(oa_config->flex_regs[j].addr) == mmio) {
-					value = oa_config->flex_regs[j].value;
-					break;
-				}
-			}
-		}
-
-		CTX_REG(reg_state, state_offset, flex_regs[i], value);
+		CTX_REG(reg_state, ctx_flexeu0 + i * 2, flex_regs[i],
+			oa_config_flex_reg(oa_config, flex_regs[i]));
 	}
 
 	CTX_REG(reg_state,
@@ -1692,6 +1693,107 @@  gen8_update_reg_state_unlocked(struct intel_context *ce,
 		intel_sseu_make_rpcs(i915, &ce->sseu));
 }
 
+struct flex {
+	i915_reg_t reg;
+	u32 offset;
+	u32 value;
+};
+
+static int
+gen8_store_flex(struct i915_request *rq,
+		struct intel_context *ce,
+		const struct flex *flex, unsigned int count)
+{
+	u32 offset;
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, 4 * count);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	offset = i915_ggtt_offset(ce->state) + LRC_STATE_PN * PAGE_SIZE;
+	do {
+		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+		*cs++ = offset + (flex->offset + 1) * sizeof(u32);
+		*cs++ = 0;
+		*cs++ = flex->value;
+	} while (flex++, --count);
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static int
+gen8_load_flex(struct i915_request *rq,
+	       struct intel_context *ce,
+	       const struct flex *flex, unsigned int count)
+{
+	u32 *cs;
+
+	GEM_BUG_ON(!count || count > 63);
+
+	cs = intel_ring_begin(rq, 2 * count + 2);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = MI_LOAD_REGISTER_IMM(count);
+	do {
+		*cs++ = i915_mmio_reg_offset(flex->reg);
+		*cs++ = flex->value;
+	} while (flex++, --count);
+	*cs++ = MI_NOOP;
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static int gen8_modify_context(struct intel_context *ce,
+			       const struct flex *flex, unsigned int count)
+{
+	struct i915_request *rq;
+	int err;
+
+	lockdep_assert_held(&ce->pin_mutex);
+
+	rq = i915_request_create(ce->engine->kernel_context);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	/* Serialise with the remote context */
+	err = i915_active_request_set(&ce->ring->timeline->last_request, rq);
+	if (err)
+		goto out_add;
+
+	/* Keep the remote context alive until after we finish editing */
+	err = i915_active_ref(&ce->active, rq->fence.context, rq);
+	if (err)
+		goto out_add;
+
+	err = gen8_store_flex(rq, ce, flex, count);
+
+out_add:
+	i915_request_add(rq);
+	return err;
+}
+
+static int gen8_modify_self(struct intel_context *ce,
+			    const struct flex *flex, unsigned int count)
+{
+	struct i915_request *rq;
+	int err;
+
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	err = gen8_load_flex(rq, ce, flex, count);
+
+	i915_request_add(rq);
+	return err;
+}
+
 /*
  * Manages updating the per-context aspects of the OA stream
  * configuration across all contexts.
@@ -1716,15 +1818,43 @@  gen8_update_reg_state_unlocked(struct intel_context *ce,
  *
  * Note: it's only the RCS/Render context that has any OA state.
  */
-static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
+static int gen8_configure_all_contexts(struct drm_i915_private *i915,
 				       const struct i915_oa_config *oa_config)
 {
-	unsigned int map_type = i915_coherent_map_type(dev_priv);
+	/* The MMIO offsets for Flex EU registers aren't contiguous */
+	const u32 ctx_flexeu0 = i915->perf.oa.ctx_flexeu0_offset;
+#define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N))
+	struct flex regs[] = {
+		{
+			GEN8_R_PWR_CLK_STATE,
+			CTX_R_PWR_CLK_STATE,
+		},
+		{
+			GEN8_OACTXCONTROL,
+			i915->perf.oa.ctx_oactxctrl_offset,
+			((i915->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
+			 (i915->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) |
+			 GEN8_OA_COUNTER_RESUME)
+		},
+		{ EU_PERF_CNTL0, ctx_flexeuN(0) },
+		{ EU_PERF_CNTL1, ctx_flexeuN(1) },
+		{ EU_PERF_CNTL2, ctx_flexeuN(2) },
+		{ EU_PERF_CNTL3, ctx_flexeuN(3) },
+		{ EU_PERF_CNTL4, ctx_flexeuN(4) },
+		{ EU_PERF_CNTL5, ctx_flexeuN(5) },
+		{ EU_PERF_CNTL6, ctx_flexeuN(6) },
+	};
+#undef ctx_flexeuN
+	struct intel_engine_cs *engine;
 	struct i915_gem_context *ctx;
-	struct i915_request *rq;
-	int ret;
+	enum intel_engine_id id;
+	int err;
+	int i;
 
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+	for (i = 2; i < ARRAY_SIZE(regs); i++)
+		regs[i].value = oa_config_flex_reg(oa_config, regs[i].reg);
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
 
 	/*
 	 * The OA register config is setup through the context image. This image
@@ -1735,59 +1865,53 @@  static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	 * We could emit the OA register config through the batch buffer but
 	 * this might leave small interval of time where the OA unit is
 	 * configured at an invalid sampling period.
-	 *
-	 * So far the best way to work around this issue seems to be draining
-	 * the GPU from any submitted work.
 	 */
-	ret = i915_gem_wait_for_idle(dev_priv,
-				     I915_WAIT_LOCKED,
-				     MAX_SCHEDULE_TIMEOUT);
-	if (ret)
-		return ret;
-
-	/* Update all contexts now that we've stalled the submission. */
-	list_for_each_entry(ctx, &dev_priv->contexts.list, link) {
+	list_for_each_entry(ctx, &i915->contexts.list, link) {
 		struct i915_gem_engines_iter it;
 		struct intel_context *ce;
 
 		for_each_gem_engine(ce,
 				    i915_gem_context_lock_engines(ctx),
 				    it) {
-			u32 *regs;
-
 			if (ce->engine->class != RENDER_CLASS)
 				continue;
 
-			/* OA settings will be set upon first use */
-			if (!ce->state)
-				continue;
-
-			regs = i915_gem_object_pin_map(ce->state->obj,
-						       map_type);
-			if (IS_ERR(regs)) {
-				i915_gem_context_unlock_engines(ctx);
-				return PTR_ERR(regs);
-			}
+			err = intel_context_lock_pinned(ce);
+			if (err)
+				break;
 
-			ce->state->obj->mm.dirty = true;
-			regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
+			regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu);
 
-			gen8_update_reg_state_unlocked(ce, regs, oa_config);
+			/* Otherwise OA settings will be set upon first use */
+			if (intel_context_is_pinned(ce))
+				err = gen8_modify_context(ce, regs, ARRAY_SIZE(regs));
 
-			i915_gem_object_unpin_map(ce->state->obj);
+			intel_context_unlock_pinned(ce);
+			if (err)
+				break;
 		}
 		i915_gem_context_unlock_engines(ctx);
+		if (err)
+			return err;
 	}
 
 	/*
-	 * Apply the configuration by doing one context restore of the edited
-	 * context image.
+	 * After updating all other contexts, we need to modify ourselves.
+	 * If we don't modify the kernel_context, we do not get events while
+	 * idle.
 	 */
-	rq = i915_request_create(dev_priv->engine[RCS0]->kernel_context);
-	if (IS_ERR(rq))
-		return PTR_ERR(rq);
+	for_each_engine(engine, i915, id) {
+		struct intel_context *ce = engine->kernel_context;
 
-	i915_request_add(rq);
+		if (engine->class != RENDER_CLASS)
+			continue;
+
+		regs[0].value = intel_sseu_make_rpcs(i915, &ce->sseu);
+
+		err = gen8_modify_self(ce, regs, ARRAY_SIZE(regs));
+		if (err)
+			return err;
+	}
 
 	return 0;
 }